From ae0bb3e011fec51fa67073d8e23d8ffeb36185d1 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 19 May 2009 11:07:10 +0300 Subject: KVM: VMX: Properly handle software interrupt re-injection in real mode When reinjecting a software interrupt or exception, use the correct instruction length provided by the hardware instead of a hardcoded 1. Fixes problems running the suse 9.1 livecd boot loader. Problem introduced by commit f0a3602c20 ("KVM: Move interrupt injection logic to x86.c"). Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f912927a58..db0b8b6df198 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -801,8 +801,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (nr == BP_VECTOR || nr == OF_VECTOR) - vmx->rmode.irq.rip++; + if (kvm_exception_is_soft(nr)) + vmx->rmode.irq.rip += + vmx->vcpu.arch.event_exit_inst_len; intr_info |= INTR_TYPE_SOFT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); @@ -2468,6 +2469,9 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (vcpu->arch.interrupt.soft) + vmx->rmode.irq.rip += + vmx->vcpu.arch.event_exit_inst_len; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); -- cgit v1.2.3-70-g09d2 From af24a4e4aec77ef16c1971cf4465f767ba946034 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 15 May 2009 18:42:05 +0530 Subject: KVM: Replace MSR_IA32_TIME_STAMP_COUNTER with MSR_IA32_TSC of msr-index.h Use standard msr-index.h's MSR declaration. MSR_IA32_TSC is better than MSR_IA32_TIME_STAMP_COUNTER as it also solves 80 column issue. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 5 ++--- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eabdc1cfab5c..79561752af97 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -752,8 +752,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } -#define MSR_IA32_TIME_STAMP_COUNTER 0x010 - #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b1f658ad2f06..48b22c9892d8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1953,7 +1953,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TIME_STAMP_COUNTER: { + case MSR_IA32_TSC: { u64 tsc; rdtscll(tsc); @@ -2043,7 +2043,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TIME_STAMP_COUNTER: { + case MSR_IA32_TSC: { u64 tsc; rdtscll(tsc); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index db0b8b6df198..c87c93fd129a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -941,7 +941,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_EFER: return kvm_get_msr_common(vcpu, msr_index, pdata); #endif - case MSR_IA32_TIME_STAMP_COUNTER: + case MSR_IA32_TSC: data = guest_read_tsc(); break; case MSR_IA32_SYSENTER_CS: @@ -1001,7 +1001,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_TIME_STAMP_COUNTER: + case MSR_IA32_TSC: rdtscll(host_tsc); guest_write_tsc(data, host_tsc); break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d4529011828..6d46079a901c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -466,7 +466,7 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; @@ -644,8 +644,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, - &vcpu->hv_clock.tsc_timestamp); + kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); ktime_get_ts(&ts); local_irq_restore(flags); -- cgit v1.2.3-70-g09d2 From 890ca9aefa78f7831f8f633cab9e4803636dffe4 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 11 May 2009 16:48:15 +0800 Subject: KVM: Add MCE support The related MSRs are emulated. MCE capability is exported via extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation such as the mcg_cap. MCE is injected via vcpu ioctl command KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are not implemented. Signed-off-by: Huang Ying Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + arch/x86/include/asm/kvm_host.h | 5 + arch/x86/kvm/x86.c | 220 +++++++++++++++++++++++++++++++++++----- include/linux/kvm.h | 20 ++++ 4 files changed, 222 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 125be8b19568..708b9c32a5da 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -17,6 +17,7 @@ #define __KVM_HAVE_USER_NMI #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX +#define __KVM_HAVE_MCE /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 79561752af97..81c68f630b14 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -373,6 +373,11 @@ struct kvm_vcpu_arch { unsigned long dr6; unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; + + u64 mcg_cap; + u64 mcg_status; + u64 mcg_ctl; + u64 *mce_banks; }; struct kvm_mem_alias { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d46079a901c..55a9dd182de8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -42,6 +42,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -55,6 +56,10 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + +#define KVM_MAX_MCE_BANKS 32 +#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P + /* EFER defaults: * - enable syscall per default because its emulated by KVM * - enable LME and LMA per default on 64 bit KVM @@ -777,23 +782,43 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 0; } -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + switch (msr) { - case MSR_EFER: - set_efer(vcpu, data); - break; - case MSR_IA32_MC0_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", - __func__, data); - break; case MSR_IA32_MCG_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", - __func__, data); + vcpu->arch.mcg_status = data; break; case MSR_IA32_MCG_CTL: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", - __func__, data); + if (!(mcg_cap & MCG_CTL_P)) + return 1; + if (data != 0 && data != ~(u64)0) + return -1; + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + /* only 0 or all 1s can be written to IA32_MCi_CTL */ + if ((offset & 0x3) == 0 && + data != 0 && data != ~(u64)0) + return -1; + vcpu->arch.mce_banks[offset] = data; + break; + } + return 1; + } + return 0; +} + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { + case MSR_EFER: + set_efer(vcpu, data); break; case MSR_IA32_DEBUGCTLMSR: if (!data) { @@ -849,6 +874,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_request_guest_time_update(vcpu); break; } + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return set_msr_mce(vcpu, msr, data); default: pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); return 1; @@ -904,26 +933,49 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; switch (msr) { - case 0xc0010010: /* SYSCFG */ - case 0xc0010015: /* HWCR */ - case MSR_IA32_PLATFORM_ID: case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MC0_CTL: - case MSR_IA32_MCG_STATUS: + data = 0; + break; case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; case MSR_IA32_MCG_CTL: - case MSR_IA32_MC0_MISC: - case MSR_IA32_MC0_MISC+4: - case MSR_IA32_MC0_MISC+8: - case MSR_IA32_MC0_MISC+12: - case MSR_IA32_MC0_MISC+16: - case MSR_IA32_MC0_MISC+20: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return 1; + } + *pdata = data; + return 0; +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + + switch (msr) { + case 0xc0010010: /* SYSCFG */ + case 0xc0010015: /* HWCR */ + case MSR_IA32_PLATFORM_ID: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: @@ -966,6 +1018,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_SYSTEM_TIME: data = vcpu->arch.time; break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return get_msr_mce(vcpu, msr, pdata); default: pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1087,6 +1146,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; + case KVM_CAP_MCE: + r = KVM_MAX_MCE_BANKS; + break; default: r = 0; break; @@ -1146,6 +1208,16 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } + case KVM_X86_GET_MCE_CAP_SUPPORTED: { + u64 mce_cap; + + mce_cap = KVM_MCE_CAP_SUPPORTED; + r = -EFAULT; + if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + goto out; + r = 0; + break; + } default: r = -EINVAL; } @@ -1502,6 +1574,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, return 0; } +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, + u64 mcg_cap) +{ + int r; + unsigned bank_num = mcg_cap & 0xff, bank; + + r = -EINVAL; + if (!bank_num) + goto out; + if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + goto out; + r = 0; + vcpu->arch.mcg_cap = mcg_cap; + /* Init IA32_MCG_CTL to all 1s */ + if (mcg_cap & MCG_CTL_P) + vcpu->arch.mcg_ctl = ~(u64)0; + /* Init IA32_MCi_CTL to all 1s */ + for (bank = 0; bank < bank_num; bank++) + vcpu->arch.mce_banks[bank*4] = ~(u64)0; +out: + return r; +} + +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, + struct kvm_x86_mce *mce) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + u64 *banks = vcpu->arch.mce_banks; + + if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) + return -EINVAL; + /* + * if IA32_MCG_CTL is not all 1s, the uncorrected error + * reporting is disabled + */ + if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && + vcpu->arch.mcg_ctl != ~(u64)0) + return 0; + banks += 4 * mce->bank; + /* + * if IA32_MCi_CTL is not all 1s, the uncorrected error + * reporting is disabled for the bank + */ + if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) + return 0; + if (mce->status & MCI_STATUS_UC) { + if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || + !(vcpu->arch.cr4 & X86_CR4_MCE)) { + printk(KERN_DEBUG "kvm: set_mce: " + "injects mce exception while " + "previous one is in progress!\n"); + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return 0; + } + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + banks[1] = mce->status; + kvm_queue_exception(vcpu, MC_VECTOR); + } else if (!(banks[1] & MCI_STATUS_VAL) + || !(banks[1] & MCI_STATUS_UC)) { + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + banks[1] = mce->status; + } else + banks[1] |= MCI_STATUS_OVER; + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1635,6 +1781,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp, kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); break; } + case KVM_X86_SETUP_MCE: { + u64 mcg_cap; + + r = -EFAULT; + if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) + goto out; + r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); + break; + } + case KVM_X86_SET_MCE: { + struct kvm_x86_mce mce; + + r = -EFAULT; + if (copy_from_user(&mce, argp, sizeof mce)) + goto out; + r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + break; + } default: r = -EINVAL; } @@ -4440,6 +4604,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_mmu_destroy; } + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + GFP_KERNEL); + if (!vcpu->arch.mce_banks) { + r = -ENOMEM; + goto fail_mmu_destroy; + } + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + return 0; fail_mmu_destroy: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 3db5d8d37485..7b17141c47c9 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -415,6 +415,9 @@ struct kvm_trace_rec { #define KVM_CAP_ASSIGN_DEV_IRQ 29 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 +#ifdef __KVM_HAVE_MCE +#define KVM_CAP_MCE 31 +#endif #ifdef KVM_CAP_IRQ_ROUTING @@ -454,6 +457,19 @@ struct kvm_irq_routing { #endif +#ifdef KVM_CAP_MCE +/* x86 MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; +#endif + /* * ioctls for VM fds */ @@ -541,6 +557,10 @@ struct kvm_irq_routing { #define KVM_NMI _IO(KVMIO, 0x9a) /* Available with KVM_CAP_SET_GUEST_DEBUG */ #define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) +/* MCE for x86 */ +#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64) +#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64) +#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce) /* * Deprecated interfaces -- cgit v1.2.3-70-g09d2 From 60af2ecdc53af5b941e4a2eb99862a7ccf3ed7c8 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 11:00:10 +0530 Subject: KVM: Use MSR names in place of address Replace 0xc0010010 with MSR_K8_SYSCFG and 0xc0010015 with MSR_K7_HWCR. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 55a9dd182de8..a93ba37eb52c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -973,8 +973,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) u64 data; switch (msr) { - case 0xc0010010: /* SYSCFG */ - case 0xc0010015: /* HWCR */ case MSR_IA32_PLATFORM_ID: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: @@ -983,6 +981,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: + case MSR_K8_SYSCFG: + case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: -- cgit v1.2.3-70-g09d2 From cb007648de83cf226d69ec76e1c01848b4e8e49f Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 12 May 2009 12:36:44 +0100 Subject: KVM: fix cpuid E2BIG handling for extended request types If we run out of cpuid entries for extended request types we should return -E2BIG, just like we do for the standard request types. Signed-off-by: Mark McLoughlin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a93ba37eb52c..25a1c5739dfc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1506,6 +1506,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + r = -EFAULT; if (copy_to_user(entries, cpuid_entries, nent * sizeof(struct kvm_cpuid_entry2))) -- cgit v1.2.3-70-g09d2 From c9eaf20f268c7051bfde2ba212c5ea76a6cbc7a1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 18 May 2009 16:13:45 +0300 Subject: KVM: x86 emulator: Implement zero-extended immediate decoding Absolute jumps use zero extended immediate operands. Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 616de4628d60..a30d5fc17eef 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -60,6 +60,7 @@ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ #define SrcOne (7<<4) /* Implied '1' */ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ +#define SrcImmU (9<<4) /* Immediate operand, unsigned */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -1027,6 +1028,7 @@ done_prefixes: c->src.type = OP_MEM; break; case SrcImm: + case SrcImmU: c->src.type = OP_IMM; c->src.ptr = (unsigned long *)c->eip; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; @@ -1044,6 +1046,19 @@ done_prefixes: c->src.val = insn_fetch(s32, 4, c->eip); break; } + if ((c->d & SrcMask) == SrcImmU) { + switch (c->src.bytes) { + case 1: + c->src.val &= 0xff; + break; + case 2: + c->src.val &= 0xffff; + break; + case 4: + c->src.val &= 0xffffffff; + break; + } + } break; case SrcImmByte: case SrcImmUByte: -- cgit v1.2.3-70-g09d2 From ee3d29e8bee8d7c321279a9bd9bd25d4cfbf79b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 18 May 2009 16:15:20 +0300 Subject: KVM: x86 emulator: fix jmp far decoding (opcode 0xea) The jump target should not be sign extened; use an unsigned decode flag. Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index a30d5fc17eef..ef4dfca3ed7e 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -196,7 +196,7 @@ static u32 opcode_table[256] = { ByteOp | SrcImmUByte, SrcImmUByte, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, - SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, + SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ -- cgit v1.2.3-70-g09d2 From 284e9b0f5ad0c049efb7f145588782bf3d8be93e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2009 08:16:14 -0400 Subject: KVM: cleanup arch/x86/kvm/Makefile Use proper foo-y style list additions to cleanup all the conditionals, move module selection after compound object selection and remove the superflous comment. Signed-off-by: Christoph Hellwig Signed-off-by: Avi Kivity --- arch/x86/kvm/Makefile | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b43c4efafe80..bee9512cd60d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,22 +1,16 @@ -# -# Makefile for Kernel-based Virtual Machine module -# - -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o) -ifeq ($(CONFIG_KVM_TRACE),y) -common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) -endif -ifeq ($(CONFIG_IOMMU_API),y) -common-objs += $(addprefix ../../../virt/kvm/, iommu.o) -endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm -kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o -obj-$(CONFIG_KVM) += kvm.o -kvm-intel-objs = vmx.o -obj-$(CONFIG_KVM_INTEL) += kvm-intel.o -kvm-amd-objs = svm.o -obj-$(CONFIG_KVM_AMD) += kvm-amd.o +kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ + coalesced_mmio.o irq_comm.o) +kvm-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) +kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) + +kvm-y += x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ + i8254.o timer.o +kvm-intel-y += vmx.o +kvm-amd-y += svm.o + +obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM_INTEL) += kvm-intel.o +obj-$(CONFIG_KVM_AMD) += kvm-amd.o -- cgit v1.2.3-70-g09d2 From 787ff73637bbf34ef276cb04a127b421de71cc43 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 18 May 2009 11:44:06 +0300 Subject: KVM: Drop interrupt shadow when single stepping should be done only on VMX The problem exists only on VMX. Also currently we skip this step if there is pending exception. The patch fixes this too. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 8 ++++++++ arch/x86/kvm/x86.c | 3 --- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c87c93fd129a..c14bffc8c1f9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3453,6 +3453,14 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); + /* * Loading guest fpu may have cleared host cr0.ts */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 25a1c5739dfc..f0642396783f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3361,9 +3361,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - kvm_x86_ops->set_interrupt_shadow(vcpu, 0); - /* try to reinject previous events if any */ if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); -- cgit v1.2.3-70-g09d2 From 0ba12d10817a8db1fd7d96d3283ec6c0b294aeab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 May 2009 16:45:19 +0300 Subject: KVM: Move common KVM Kconfig items to new file virt/kvm/Kconfig Reduce Kconfig code duplication. Signed-off-by: Avi Kivity --- arch/ia64/kvm/Kconfig | 7 ++----- arch/powerpc/kvm/Kconfig | 3 +-- arch/s390/kvm/Kconfig | 6 +----- arch/x86/kvm/Kconfig | 7 ++----- virt/kvm/Kconfig | 7 +++++++ 5 files changed, 13 insertions(+), 17 deletions(-) create mode 100644 virt/kvm/Kconfig diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 64d520937874..f922bbba3797 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -1,12 +1,8 @@ # # KVM configuration # -config HAVE_KVM - bool -config HAVE_KVM_IRQCHIP - bool - default y +source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" @@ -28,6 +24,7 @@ config KVM depends on PCI select PREEMPT_NOTIFIERS select ANON_INODES + select HAVE_KVM_IRQCHIP ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 5a152a52796f..46019dccce1c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -2,8 +2,7 @@ # KVM configuration # -config HAVE_KVM_IRQCHIP - bool +source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 3e260b7e37b2..ad75ce33be12 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -1,11 +1,7 @@ # # KVM configuration # -config HAVE_KVM - bool - -config HAVE_KVM_IRQCHIP - bool +source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8600a09e0c6c..939b1cb9c5b3 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -1,12 +1,8 @@ # # KVM configuration # -config HAVE_KVM - bool -config HAVE_KVM_IRQCHIP - bool - default y +source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" @@ -29,6 +25,7 @@ config KVM select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES + select HAVE_KVM_IRQCHIP ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig new file mode 100644 index 000000000000..79e83fc00dc7 --- /dev/null +++ b/virt/kvm/Kconfig @@ -0,0 +1,7 @@ +# KVM common configuration items and defaults + +config HAVE_KVM + bool + +config HAVE_KVM_IRQCHIP + bool -- cgit v1.2.3-70-g09d2 From 721eecbf4fe995ca94a9edec0c9843b1cc0eaaf3 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 20 May 2009 10:30:49 -0400 Subject: KVM: irqfd KVM provides a complete virtual system environment for guests, including support for injecting interrupts modeled after the real exception/interrupt facilities present on the native platform (such as the IDT on x86). Virtual interrupts can come from a variety of sources (emulated devices, pass-through devices, etc) but all must be injected to the guest via the KVM infrastructure. This patch adds a new mechanism to inject a specific interrupt to a guest using a decoupled eventfd mechnanism: Any legal signal on the irqfd (using eventfd semantics from either userspace or kernel) will translate into an injected interrupt in the guest at the next available interrupt window. Signed-off-by: Gregory Haskins Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/x86.c | 1 + include/linux/kvm.h | 11 ++ include/linux/kvm_host.h | 24 ++++ virt/kvm/Kconfig | 4 + virt/kvm/eventfd.c | 329 +++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 12 ++ 8 files changed, 383 insertions(+), 1 deletion(-) create mode 100644 virt/kvm/eventfd.c diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 939b1cb9c5b3..8cd2a4efe238 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -26,6 +26,7 @@ config KVM select MMU_NOTIFIER select ANON_INODES select HAVE_KVM_IRQCHIP + select HAVE_KVM_EVENTFD ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index bee9512cd60d..01e3c61f749a 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -2,7 +2,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o) + coalesced_mmio.o irq_comm.o eventfd.o) kvm-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f0642396783f..15f39fc08ece 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1126,6 +1126,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_ASSIGN_DEV_IRQ: + case KVM_CAP_IRQFD: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 7b17141c47c9..8f53f24e5274 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -418,6 +418,7 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_MCE #define KVM_CAP_MCE 31 #endif +#define KVM_CAP_IRQFD 32 #ifdef KVM_CAP_IRQ_ROUTING @@ -470,6 +471,15 @@ struct kvm_x86_mce { }; #endif +#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) + +struct kvm_irqfd { + __u32 fd; + __u32 gsi; + __u32 flags; + __u8 pad[20]; +}; + /* * ioctls for VM fds */ @@ -514,6 +524,7 @@ struct kvm_x86_mce { #define KVM_ASSIGN_SET_MSIX_ENTRY \ _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry) #define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) +#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) /* * ioctls for vcpu fds diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3060bdc35ffe..7724dcb6ff76 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -136,6 +136,12 @@ struct kvm { struct list_head vm_list; struct kvm_io_bus mmio_bus; struct kvm_io_bus pio_bus; +#ifdef CONFIG_HAVE_KVM_EVENTFD + struct { + spinlock_t lock; + struct list_head items; + } irqfds; +#endif struct kvm_vm_stat stat; struct kvm_arch arch; atomic_t users_count; @@ -525,4 +531,22 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} #endif +#ifdef CONFIG_HAVE_KVM_EVENTFD + +void kvm_irqfd_init(struct kvm *kvm); +int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); +void kvm_irqfd_release(struct kvm *kvm); + +#else + +static inline void kvm_irqfd_init(struct kvm *kvm) {} +static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +{ + return -EINVAL; +} + +static inline void kvm_irqfd_release(struct kvm *kvm) {} + +#endif /* CONFIG_HAVE_KVM_EVENTFD */ + #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 79e83fc00dc7..56c6848d2df8 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -5,3 +5,7 @@ config HAVE_KVM config HAVE_KVM_IRQCHIP bool + +config HAVE_KVM_EVENTFD + bool + select EVENTFD diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c new file mode 100644 index 000000000000..314012323afe --- /dev/null +++ b/virt/kvm/eventfd.c @@ -0,0 +1,329 @@ +/* + * kvm eventfd support - use eventfd objects to signal various KVM events + * + * Copyright 2009 Novell. All Rights Reserved. + * + * Author: + * Gregory Haskins + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * -------------------------------------------------------------------- + * irqfd: Allows an fd to be used to inject an interrupt to the guest + * + * Credit goes to Avi Kivity for the original idea. + * -------------------------------------------------------------------- + */ + +struct _irqfd { + struct kvm *kvm; + struct eventfd_ctx *eventfd; + int gsi; + struct list_head list; + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct inject; + struct work_struct shutdown; +}; + +static struct workqueue_struct *irqfd_cleanup_wq; + +static void +irqfd_inject(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); + struct kvm *kvm = irqfd->kvm; + + mutex_lock(&kvm->lock); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); + mutex_unlock(&kvm->lock); +} + +/* + * Race-free decouple logic (ordering is critical) + */ +static void +irqfd_shutdown(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. + */ + remove_wait_queue(irqfd->wqh, &irqfd->wait); + + /* + * We know no new events will be scheduled at this point, so block + * until all previously outstanding events have completed + */ + flush_work(&irqfd->inject); + + /* + * It is now safe to release the object's resources + */ + eventfd_ctx_put(irqfd->eventfd); + kfree(irqfd); +} + + +/* assumes kvm->irqfds.lock is held */ +static bool +irqfd_is_active(struct _irqfd *irqfd) +{ + return list_empty(&irqfd->list) ? false : true; +} + +/* + * Mark the irqfd as inactive and schedule it for removal + * + * assumes kvm->irqfds.lock is held + */ +static void +irqfd_deactivate(struct _irqfd *irqfd) +{ + BUG_ON(!irqfd_is_active(irqfd)); + + list_del_init(&irqfd->list); + + queue_work(irqfd_cleanup_wq, &irqfd->shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int +irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); + unsigned long flags = (unsigned long)key; + + if (flags & POLLIN) + /* An event has been signaled, inject an interrupt */ + schedule_work(&irqfd->inject); + + if (flags & POLLHUP) { + /* The eventfd is closing, detach from KVM */ + struct kvm *kvm = irqfd->kvm; + unsigned long flags; + + spin_lock_irqsave(&kvm->irqfds.lock, flags); + + /* + * We must check if someone deactivated the irqfd before + * we could acquire the irqfds.lock since the item is + * deactivated from the KVM side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will cleanup for us. + * We cannot race against the irqfd going away since the + * other side is required to acquire wqh->lock, which we hold + */ + if (irqfd_is_active(irqfd)) + irqfd_deactivate(irqfd); + + spin_unlock_irqrestore(&kvm->irqfds.lock, flags); + } + + return 0; +} + +static void +irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); + + irqfd->wqh = wqh; + add_wait_queue(wqh, &irqfd->wait); +} + +static int +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) +{ + struct _irqfd *irqfd; + struct file *file = NULL; + struct eventfd_ctx *eventfd = NULL; + int ret; + unsigned int events; + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->kvm = kvm; + irqfd->gsi = gsi; + INIT_LIST_HEAD(&irqfd->list); + INIT_WORK(&irqfd->inject, irqfd_inject); + INIT_WORK(&irqfd->shutdown, irqfd_shutdown); + + file = eventfd_fget(fd); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + eventfd = eventfd_ctx_fileget(file); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->eventfd = eventfd; + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); + + events = file->f_op->poll(file, &irqfd->pt); + + spin_lock_irq(&kvm->irqfds.lock); + list_add_tail(&irqfd->list, &kvm->irqfds.items); + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. + */ + if (events & POLLIN) + schedule_work(&irqfd->inject); + + /* + * do not drop the file until the irqfd is fully initialized, otherwise + * we might race against the POLLHUP + */ + fput(file); + + return 0; + +fail: + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + + if (file && !IS_ERR(file)) + fput(file); + + kfree(irqfd); + return ret; +} + +void +kvm_irqfd_init(struct kvm *kvm) +{ + spin_lock_init(&kvm->irqfds.lock); + INIT_LIST_HEAD(&kvm->irqfds.items); +} + +/* + * shutdown any irqfd's that match fd+gsi + */ +static int +kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) +{ + struct _irqfd *irqfd, *tmp; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) + irqfd_deactivate(irqfd); + } + + spin_unlock_irq(&kvm->irqfds.lock); + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more interrupts on this + * gsi once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +int +kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +{ + if (flags & KVM_IRQFD_FLAG_DEASSIGN) + return kvm_irqfd_deassign(kvm, fd, gsi); + + return kvm_irqfd_assign(kvm, fd, gsi); +} + +/* + * This function is called as the kvm VM fd is being released. Shutdown all + * irqfds that still remain open + */ +void +kvm_irqfd_release(struct kvm *kvm) +{ + struct _irqfd *irqfd, *tmp; + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) + irqfd_deactivate(irqfd); + + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * Block until we know all outstanding shutdown jobs have completed + * since we do not take a kvm* reference. + */ + flush_workqueue(irqfd_cleanup_wq); + +} + +/* + * create a host-wide workqueue for issuing deferred shutdown requests + * aggregated from all vm* instances. We need our own isolated single-thread + * queue to prevent deadlock against flushing the normal work-queue. + */ +static int __init irqfd_module_init(void) +{ + irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +static void __exit irqfd_module_exit(void) +{ + destroy_workqueue(irqfd_cleanup_wq); +} + +module_init(irqfd_module_init); +module_exit(irqfd_module_exit); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2884baf1d5f9..dee321e58894 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -986,6 +986,7 @@ static struct kvm *kvm_create_vm(void) spin_lock_init(&kvm->mmu_lock); spin_lock_init(&kvm->requests_lock); kvm_io_bus_init(&kvm->pio_bus); + kvm_irqfd_init(kvm); mutex_init(&kvm->lock); kvm_io_bus_init(&kvm->mmio_bus); init_rwsem(&kvm->slots_lock); @@ -1071,6 +1072,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) { struct kvm *kvm = filp->private_data; + kvm_irqfd_release(kvm); + kvm_put_kvm(kvm); return 0; } @@ -2222,6 +2225,15 @@ static long kvm_vm_ioctl(struct file *filp, } #endif #endif /* KVM_CAP_IRQ_ROUTING */ + case KVM_IRQFD: { + struct kvm_irqfd data; + + r = -EFAULT; + if (copy_from_user(&data, argp, sizeof data)) + goto out; + r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); + break; + } default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } -- cgit v1.2.3-70-g09d2 From c5ff41ce66382d657a76bc06ba252d848826950f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 14 May 2009 22:42:53 +0200 Subject: KVM: Allow PIT emulation without speaker port The in-kernel speaker emulation is only a dummy and also unneeded from the performance point of view. Rather, it takes user space support to generate sound output on the host, e.g. console beeps. To allow this, introduce KVM_CREATE_PIT2 which controls in-kernel speaker port emulation via a flag passed along the new IOCTL. It also leaves room for future extensions of the PIT configuration interface. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 14 ++++++++------ arch/x86/kvm/i8254.h | 2 +- arch/x86/kvm/x86.c | 12 +++++++++++- include/linux/kvm.h | 12 ++++++++++++ 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 21f68e00524f..0990bc9aac1f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -563,7 +563,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) } } -struct kvm_pit *kvm_create_pit(struct kvm *kvm) +struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; struct kvm_kpit_state *pit_state; @@ -589,11 +589,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit->dev.private = pit; kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); - pit->speaker_dev.read = speaker_ioport_read; - pit->speaker_dev.write = speaker_ioport_write; - pit->speaker_dev.in_range = speaker_in_range; - pit->speaker_dev.private = pit; - kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); + if (flags & KVM_PIT_SPEAKER_DUMMY) { + pit->speaker_dev.read = speaker_ioport_read; + pit->speaker_dev.write = speaker_ioport_write; + pit->speaker_dev.in_range = speaker_in_range; + pit->speaker_dev.private = pit; + kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); + } kvm->arch.vpit = pit; pit->kvm = kvm; diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index bbd863ff60b7..b2670180f225 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -50,7 +50,7 @@ struct kvm_pit { void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); -struct kvm_pit *kvm_create_pit(struct kvm *kvm); +struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); void kvm_pit_reset(struct kvm_pit *pit); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15f39fc08ece..5eb3b8dd74b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1127,6 +1127,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_IRQFD: + case KVM_CAP_PIT2: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2038,6 +2039,7 @@ long kvm_arch_vm_ioctl(struct file *filp, union { struct kvm_pit_state ps; struct kvm_memory_alias alias; + struct kvm_pit_config pit_config; } u; switch (ioctl) { @@ -2098,12 +2100,20 @@ long kvm_arch_vm_ioctl(struct file *filp, } break; case KVM_CREATE_PIT: + u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; + goto create_pit; + case KVM_CREATE_PIT2: + r = -EFAULT; + if (copy_from_user(&u.pit_config, argp, + sizeof(struct kvm_pit_config))) + goto out; + create_pit: mutex_lock(&kvm->lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; r = -ENOMEM; - kvm->arch.vpit = kvm_create_pit(kvm); + kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); if (kvm->arch.vpit) r = 0; create_pit_unlock: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 8f53f24e5274..65df6f5e2b98 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -70,6 +70,14 @@ struct kvm_irqchip { } chip; }; +/* for KVM_CREATE_PIT2 */ +struct kvm_pit_config { + __u32 flags; + __u32 pad[15]; +}; + +#define KVM_PIT_SPEAKER_DUMMY 1 + #define KVM_EXIT_UNKNOWN 0 #define KVM_EXIT_EXCEPTION 1 #define KVM_EXIT_IO 2 @@ -419,6 +427,9 @@ struct kvm_trace_rec { #define KVM_CAP_MCE 31 #endif #define KVM_CAP_IRQFD 32 +#ifdef __KVM_HAVE_PIT +#define KVM_CAP_PIT2 33 +#endif #ifdef KVM_CAP_IRQ_ROUTING @@ -525,6 +536,7 @@ struct kvm_irqfd { _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry) #define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) #define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) +#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) /* * ioctls for vcpu fds -- cgit v1.2.3-70-g09d2 From 3032b925f00ba2653f7695d356d6f8284c82038d Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Mon, 25 May 2009 10:22:17 +0200 Subject: KVM: ia64: Correct itc_offset calculations Init the itc_offset for all possible vCPUs. The current code by mistake ends up only initializing the offset on vCPU 0. Spotted by Gleb Natapov. Signed-off-by: Jes Sorensen Acked-by : Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 80c57b0a21c4..319922137fdd 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1224,7 +1224,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) /*Initialize itc offset for vcpus*/ itc_offset = 0UL - kvm_get_itc(vcpu); - for (i = 0; i < kvm->arch.online_vcpus; i++) { + for (i = 0; i < KVM_MAX_VCPUS; i++) { v = (struct kvm_vcpu *)((char *)vcpu + sizeof(struct kvm_vcpu_data) * i); v->arch.itc_offset = itc_offset; -- cgit v1.2.3-70-g09d2 From 9ace903d171db7dc2fed96e44ac62b6f4c3ccb3d Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Wed, 20 May 2009 15:34:55 +0200 Subject: KVM: s390: infrastructure to kick vcpus out of guest state To ensure vcpu's come out of guest context in certain cases this patch adds a s390 specific way to kick them out of guest context. Currently it kicks them out to rerun the vcpu_run path in the s390 code, but the mechanism itself is expandable and with a new flag we could also add e.g. kicks to userspace etc. Signed-off-by: Christian Ehrhardt Signed-off-by: Avi Kivity --- arch/s390/include/asm/kvm_host.h | 5 ++-- arch/s390/kvm/intercept.c | 12 ++++++--- arch/s390/kvm/kvm-s390.c | 5 ++++ arch/s390/kvm/kvm-s390.h | 3 +++ arch/s390/kvm/sigp.c | 56 ++++++++++++++++++++++++---------------- 5 files changed, 54 insertions(+), 27 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 1cd02f6073a0..0a784f6f9c90 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -182,8 +182,9 @@ struct kvm_s390_interrupt_info { }; /* for local_interrupt.action_flags */ -#define ACTION_STORE_ON_STOP 1 -#define ACTION_STOP_ON_STOP 2 +#define ACTION_STORE_ON_STOP (1<<0) +#define ACTION_STOP_ON_STOP (1<<1) +#define ACTION_RELOADVCPU_ON_STOP (1<<2) struct kvm_s390_local_interrupt { spinlock_t lock; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 98997ccba501..0732ab4305f4 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -128,7 +128,7 @@ static int handle_noop(struct kvm_vcpu *vcpu) static int handle_stop(struct kvm_vcpu *vcpu) { - int rc; + int rc = 0; vcpu->stat.exit_stop_request++; atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); @@ -141,12 +141,18 @@ static int handle_stop(struct kvm_vcpu *vcpu) rc = -ENOTSUPP; } + if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { + vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; + rc = SIE_INTERCEPT_RERUNVCPU; + vcpu->run->exit_reason = KVM_EXIT_INTR; + } + if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) { vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP; VCPU_EVENT(vcpu, 3, "%s", "cpu stopped"); rc = -ENOTSUPP; - } else - rc = 0; + } + spin_unlock_bh(&vcpu->arch.local_int.lock); return rc; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 90d9d1ba258b..1d65f6277166 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -490,6 +490,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu_load(vcpu); +rerun_vcpu: /* verify, that memory has been registered */ if (!vcpu->kvm->arch.guest_memsize) { vcpu_put(vcpu); @@ -509,6 +510,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr; break; case KVM_EXIT_UNKNOWN: + case KVM_EXIT_INTR: case KVM_EXIT_S390_RESET: break; default: @@ -522,6 +524,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) rc = kvm_handle_sie_intercept(vcpu); } while (!signal_pending(current) && !rc); + if (rc == SIE_INTERCEPT_RERUNVCPU) + goto rerun_vcpu; + if (signal_pending(current) && !rc) rc = -EINTR; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 748fee872323..2072cd4a013e 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -20,6 +20,8 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); +/* negativ values are error codes, positive values for internal conditions */ +#define SIE_INTERCEPT_RERUNVCPU (1<<0) int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ @@ -50,6 +52,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt *s390int); int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); +int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); /* implemented in priv.c */ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 0ef81d6776e9..21897b0f8a36 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -1,7 +1,7 @@ /* * sigp.c - handlinge interprocessor communication * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2009 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -9,6 +9,7 @@ * * Author(s): Carsten Otte * Christian Borntraeger + * Christian Ehrhardt */ #include @@ -107,46 +108,57 @@ unlock: return rc; } -static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store) +static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; - struct kvm_s390_local_interrupt *li; struct kvm_s390_interrupt_info *inti; - int rc; - - if (cpu_addr >= KVM_MAX_VCPUS) - return 3; /* not operational */ inti = kzalloc(sizeof(*inti), GFP_KERNEL); if (!inti) return -ENOMEM; - inti->type = KVM_S390_SIGP_STOP; - spin_lock(&fi->lock); - li = fi->local_int[cpu_addr]; - if (li == NULL) { - rc = 3; /* not operational */ - kfree(inti); - goto unlock; - } spin_lock_bh(&li->lock); list_add_tail(&inti->list, &li->list); atomic_set(&li->active, 1); atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); - if (store) - li->action_bits |= ACTION_STORE_ON_STOP; - li->action_bits |= ACTION_STOP_ON_STOP; + li->action_bits |= action; if (waitqueue_active(&li->wq)) wake_up_interruptible(&li->wq); spin_unlock_bh(&li->lock); - rc = 0; /* order accepted */ + + return 0; /* order accepted */ +} + +static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_local_interrupt *li; + int rc; + + if (cpu_addr >= KVM_MAX_VCPUS) + return 3; /* not operational */ + + spin_lock(&fi->lock); + li = fi->local_int[cpu_addr]; + if (li == NULL) { + rc = 3; /* not operational */ + goto unlock; + } + + rc = __inject_sigp_stop(li, action); + unlock: spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr); return rc; } +int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + return __inject_sigp_stop(li, action); +} + static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) { int rc; @@ -262,11 +274,11 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) break; case SIGP_STOP: vcpu->stat.instruction_sigp_stop++; - rc = __sigp_stop(vcpu, cpu_addr, 0); + rc = __sigp_stop(vcpu, cpu_addr, ACTION_STOP_ON_STOP); break; case SIGP_STOP_STORE_STATUS: vcpu->stat.instruction_sigp_stop++; - rc = __sigp_stop(vcpu, cpu_addr, 1); + rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP); break; case SIGP_SET_ARCH: vcpu->stat.instruction_sigp_arch++; -- cgit v1.2.3-70-g09d2 From b1d16c495d9e6fe48e7df2e1d18cafc6555a116a Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Wed, 20 May 2009 15:34:56 +0200 Subject: KVM: s390: fix signal handling If signal pending is true we exit without updating kvm_run, userspace currently just does nothing and jumps to kvm_run again. Since we did not set an exit_reason we might end up with a random one (whatever was the last exit). Therefore it was possible to e.g. jump to the psw position the last real interruption set. Setting the INTR exit reason ensures that no old psw data is swapped in on reentry. Signed-off-by: Christian Ehrhardt Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1d65f6277166..5c1c30259002 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -527,8 +527,10 @@ rerun_vcpu: if (rc == SIE_INTERCEPT_RERUNVCPU) goto rerun_vcpu; - if (signal_pending(current) && !rc) + if (signal_pending(current) && !rc) { + kvm_run->exit_reason = KVM_EXIT_INTR; rc = -EINTR; + } if (rc == -ENOTSUPP) { /* intercept cannot be handled in-kernel, prepare kvm-run */ -- cgit v1.2.3-70-g09d2 From 628eb9b8a8f3ef31d8316112a4596b1a21b38159 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 25 May 2009 13:40:51 +0200 Subject: KVM: s390: streamline memslot handling This patch relocates the variables kvm-s390 uses to track guest mem addr/size. As discussed dropping the variables at struct kvm_arch level allows to use the common vcpu->request based mechanism to reload guest memory if e.g. changes via set_memory_region. The kick mechanism introduced in this series is used to ensure running vcpus leave guest state to catch the update. Signed-off-by: Christian Ehrhardt Signed-off-by: Avi Kivity --- arch/s390/include/asm/kvm_host.h | 4 +--- arch/s390/kvm/gaccess.h | 23 +++++++++--------- arch/s390/kvm/intercept.c | 6 ++--- arch/s390/kvm/kvm-s390.c | 50 +++++++++++++--------------------------- arch/s390/kvm/kvm-s390.h | 29 ++++++++++++++++++++++- arch/s390/kvm/sigp.c | 4 ++-- 6 files changed, 62 insertions(+), 54 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 0a784f6f9c90..75535d4d7a05 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -1,7 +1,7 @@ /* * asm-s390/kvm_host.h - definition for kernel virtual machines on s390 * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2009 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -228,8 +228,6 @@ struct kvm_vm_stat { }; struct kvm_arch{ - unsigned long guest_origin; - unsigned long guest_memsize; struct sca_block *sca; debug_info_t *dbf; struct kvm_s390_float_interrupt float_int; diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index ed60f3a74a85..03c716a0f01f 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -1,7 +1,7 @@ /* * gaccess.h - access guest memory * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2009 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -16,13 +16,14 @@ #include #include #include +#include "kvm-s390.h" static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu, unsigned long guestaddr) { unsigned long prefix = vcpu->arch.sie_block->prefix; - unsigned long origin = vcpu->kvm->arch.guest_origin; - unsigned long memsize = vcpu->kvm->arch.guest_memsize; + unsigned long origin = vcpu->arch.sie_block->gmsor; + unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); if (guestaddr < 2 * PAGE_SIZE) guestaddr += prefix; @@ -158,8 +159,8 @@ static inline int copy_to_guest(struct kvm_vcpu *vcpu, unsigned long guestdest, const void *from, unsigned long n) { unsigned long prefix = vcpu->arch.sie_block->prefix; - unsigned long origin = vcpu->kvm->arch.guest_origin; - unsigned long memsize = vcpu->kvm->arch.guest_memsize; + unsigned long origin = vcpu->arch.sie_block->gmsor; + unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE)) goto slowpath; @@ -209,8 +210,8 @@ static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to, unsigned long guestsrc, unsigned long n) { unsigned long prefix = vcpu->arch.sie_block->prefix; - unsigned long origin = vcpu->kvm->arch.guest_origin; - unsigned long memsize = vcpu->kvm->arch.guest_memsize; + unsigned long origin = vcpu->arch.sie_block->gmsor; + unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE)) goto slowpath; @@ -244,8 +245,8 @@ static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu, unsigned long guestdest, const void *from, unsigned long n) { - unsigned long origin = vcpu->kvm->arch.guest_origin; - unsigned long memsize = vcpu->kvm->arch.guest_memsize; + unsigned long origin = vcpu->arch.sie_block->gmsor; + unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); if (guestdest + n > memsize) return -EFAULT; @@ -262,8 +263,8 @@ static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to, unsigned long guestsrc, unsigned long n) { - unsigned long origin = vcpu->kvm->arch.guest_origin; - unsigned long memsize = vcpu->kvm->arch.guest_memsize; + unsigned long origin = vcpu->arch.sie_block->gmsor; + unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); if (guestsrc + n > memsize) return -EFAULT; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 0732ab4305f4..ba9d8a7bc1ac 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -1,7 +1,7 @@ /* * intercept.c - in-kernel handling for sie intercepts * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2009 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -164,9 +164,9 @@ static int handle_validity(struct kvm_vcpu *vcpu) vcpu->stat.exit_validity++; if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix - <= vcpu->kvm->arch.guest_memsize - 2*PAGE_SIZE)){ + <= kvm_s390_vcpu_get_memsize(vcpu) - 2*PAGE_SIZE)) { rc = fault_in_pages_writeable((char __user *) - vcpu->kvm->arch.guest_origin + + vcpu->arch.sie_block->gmsor + vcpu->arch.sie_block->prefix, 2*PAGE_SIZE); if (rc) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5c1c30259002..098bfa6fbdf6 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1,7 +1,7 @@ /* * s390host.c -- hosting zSeries kernel virtual machines * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2009 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -10,6 +10,7 @@ * Author(s): Carsten Otte * Christian Borntraeger * Heiko Carstens + * Christian Ehrhardt */ #include @@ -278,16 +279,10 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gbea = 1; } -/* The current code can have up to 256 pages for virtio */ -#define VIRTIODESCSPACE (256ul * 4096ul) - int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH); - vcpu->arch.sie_block->gmslm = vcpu->kvm->arch.guest_memsize + - vcpu->kvm->arch.guest_origin + - VIRTIODESCSPACE - 1ul; - vcpu->arch.sie_block->gmsor = vcpu->kvm->arch.guest_origin; + set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests); vcpu->arch.sie_block->ecb = 2; vcpu->arch.sie_block->eca = 0xC1002001U; vcpu->arch.sie_block->fac = (int) (long) facilities; @@ -491,9 +486,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu_load(vcpu); rerun_vcpu: + if (vcpu->requests) + if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + kvm_s390_vcpu_set_mem(vcpu); + /* verify, that memory has been registered */ - if (!vcpu->kvm->arch.guest_memsize) { + if (!vcpu->arch.sie_block->gmslm) { vcpu_put(vcpu); + VCPU_EVENT(vcpu, 3, "%s", "no memory registered to run vcpu"); return -EINVAL; } @@ -691,7 +691,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, vmas. It is okay to mmap() and munmap() stuff in this slot after doing this call at any time */ - if (mem->slot || kvm->arch.guest_memsize) + if (mem->slot) return -EINVAL; if (mem->guest_phys_addr) @@ -706,36 +706,18 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (!user_alloc) return -EINVAL; - /* lock all vcpus */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (!kvm->vcpus[i]) - continue; - if (!mutex_trylock(&kvm->vcpus[i]->mutex)) - goto fail_out; - } - - kvm->arch.guest_origin = mem->userspace_addr; - kvm->arch.guest_memsize = mem->memory_size; - - /* update sie control blocks, and unlock all vcpus */ + /* request update of sie control block for all available vcpus */ for (i = 0; i < KVM_MAX_VCPUS; ++i) { if (kvm->vcpus[i]) { - kvm->vcpus[i]->arch.sie_block->gmsor = - kvm->arch.guest_origin; - kvm->vcpus[i]->arch.sie_block->gmslm = - kvm->arch.guest_memsize + - kvm->arch.guest_origin + - VIRTIODESCSPACE - 1ul; - mutex_unlock(&kvm->vcpus[i]->mutex); + if (test_and_set_bit(KVM_REQ_MMU_RELOAD, + &kvm->vcpus[i]->requests)) + continue; + kvm_s390_inject_sigp_stop(kvm->vcpus[i], + ACTION_RELOADVCPU_ON_STOP); } } return 0; - -fail_out: - for (; i >= 0; i--) - mutex_unlock(&kvm->vcpus[i]->mutex); - return -EINVAL; } void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 2072cd4a013e..ec5eee7c25d8 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -1,7 +1,7 @@ /* * kvm_s390.h - definition for kvm on s390 * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2009 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -9,6 +9,7 @@ * * Author(s): Carsten Otte * Christian Borntraeger + * Christian Ehrhardt */ #ifndef ARCH_S390_KVM_S390_H @@ -18,6 +19,9 @@ #include #include +/* The current code can have up to 256 pages for virtio */ +#define VIRTIODESCSPACE (256ul * 4096ul) + typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); /* negativ values are error codes, positive values for internal conditions */ @@ -54,6 +58,29 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); +static inline int kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.sie_block->gmslm + - vcpu->arch.sie_block->gmsor + - VIRTIODESCSPACE + 1ul; +} + +static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu) +{ + struct kvm_memory_slot *mem; + + down_read(&vcpu->kvm->slots_lock); + mem = &vcpu->kvm->memslots[0]; + + vcpu->arch.sie_block->gmsor = mem->userspace_addr; + vcpu->arch.sie_block->gmslm = + mem->userspace_addr + + (mem->npages << PAGE_SHIFT) + + VIRTIODESCSPACE - 1ul; + + up_read(&vcpu->kvm->slots_lock); +} + /* implemented in priv.c */ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 21897b0f8a36..40c8c6748cfe 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -189,9 +189,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, /* make sure that the new value is valid memory */ address = address & 0x7fffe000u; if ((copy_from_guest(vcpu, &tmp, - (u64) (address + vcpu->kvm->arch.guest_origin) , 1)) || + (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) || (copy_from_guest(vcpu, &tmp, (u64) (address + - vcpu->kvm->arch.guest_origin + PAGE_SIZE), 1))) { + vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) { *reg |= SIGP_STAT_INVALID_PARAMETER; return 1; /* invalid parameter */ } -- cgit v1.2.3-70-g09d2 From e7333391403b31feb27a05bc0dcd052a471f1276 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 21 May 2009 13:50:13 +0800 Subject: KVM: Downsize max support MSI-X entry to 256 We only trap one page for MSI-X entry now, so it's 4k/(128/8) = 256 entries at most. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 65df6f5e2b98..632a8560dbcf 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -676,7 +676,7 @@ struct kvm_assigned_msix_nr { __u16 padding; }; -#define KVM_MAX_MSIX_PER_DEV 512 +#define KVM_MAX_MSIX_PER_DEV 256 struct kvm_assigned_msix_entry { __u32 assigned_dev_id; __u32 gsi; -- cgit v1.2.3-70-g09d2 From 017cb99e875f2d8ff375cbb576c794b081cd0bd5 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 28 May 2009 11:56:31 +0200 Subject: KVM: SVM: use explicit 64bit storage for sysenter values Since AMD does not support sysenter in 64bit mode, the VMCB fields storing the MSRs are truncated to 32bit upon VMRUN/#VMEXIT. So store the values in a separate 64bit storage to avoid truncation. [andre: fix amd->amd migration] Signed-off-by: Christoph Egger Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_svm.h | 2 ++ arch/x86/kvm/svm.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index ed66e4c078dc..6f275b4cf628 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -27,6 +27,8 @@ struct vcpu_svm { unsigned long vmcb_pa; struct svm_cpu_data *svm_data; uint64_t asid_generation; + uint64_t sysenter_esp; + uint64_t sysenter_eip; u64 next_rip; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 48b22c9892d8..e3e7edca35d1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -367,8 +367,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm) #endif set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); - set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); } static void svm_enable_lbrv(struct vcpu_svm *svm) @@ -1981,10 +1979,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = svm->vmcb->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - *data = svm->vmcb->save.sysenter_eip; + *data = svm->sysenter_eip; break; case MSR_IA32_SYSENTER_ESP: - *data = svm->vmcb->save.sysenter_esp; + *data = svm->sysenter_esp; break; /* Nobody will change the following 5 values in the VMCB so we can safely return them on rdmsr. They will always be 0 @@ -2071,9 +2069,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->vmcb->save.sysenter_cs = data; break; case MSR_IA32_SYSENTER_EIP: + svm->sysenter_eip = data; svm->vmcb->save.sysenter_eip = data; break; case MSR_IA32_SYSENTER_ESP: + svm->sysenter_esp = data; svm->vmcb->save.sysenter_esp = data; break; case MSR_IA32_DEBUGCTLMSR: -- cgit v1.2.3-70-g09d2 From 968a6347936f9421b5bc5358b579e811c52726bc Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 30 Apr 2009 10:58:42 +0800 Subject: KVM: No disable_irq for MSI/MSI-X interrupt on device assignment Disable interrupt at interrupt handler and enable it when guest ack is for the level triggered interrupt, to prevent reinjected interrupt. MSI/MSI-X don't need it. One possible problem is multiply same vector interrupt injected between irq handler and scheduled work handler would be merged as one for MSI/MSI-X. But AFAIK, the drivers handle it well. The patch fixed the oplin card performance issue(MSI-X performance is half of MSI/INTx). Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dee321e58894..2d9bc439d9a9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -120,7 +120,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) { struct kvm_assigned_dev_kernel *assigned_dev; struct kvm *kvm; - int irq, i; + int i; assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, interrupt_work); @@ -143,20 +143,10 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, guest_entries[i].vector, 1); - irq = assigned_dev->host_msix_entries[i].vector; - if (irq != 0) - enable_irq(irq); - assigned_dev->host_irq_disabled = false; } - } else { + } else kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, assigned_dev->guest_irq, 1); - if (assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_MSI) { - enable_irq(assigned_dev->host_irq); - assigned_dev->host_irq_disabled = false; - } - } spin_unlock_irq(&assigned_dev->assigned_dev_lock); mutex_unlock(&assigned_dev->kvm->lock); @@ -179,8 +169,10 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) schedule_work(&assigned_dev->interrupt_work); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + } out: spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); @@ -417,6 +409,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; return 0; } #endif @@ -427,6 +420,7 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; return 0; } #endif -- cgit v1.2.3-70-g09d2 From b188d2d365702cfc660a56d66f4bf77ebf14a5c2 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Fri, 29 May 2009 12:58:50 +0200 Subject: KVM: remove redundant declarations Changing s390 code in kvm_arch_vcpu_load/put come across this header declarations. They are complete duplicates, not even useful forward declarations as nothing using it is in between (maybe it was that in the past). This patch removes the two dispensable lines. Signed-off-by: Christian Ehrhardt Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7724dcb6ff76..19240feefe6f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -249,8 +249,6 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_dev_ioctl_check_extension(long ext); -- cgit v1.2.3-70-g09d2 From 6c8166a77c98f473eb91e96a61c3cf78ac617278 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 31 May 2009 18:15:37 +0300 Subject: KVM: SVM: Fold kvm_svm.h info svm.c kvm_svm.h is only included from svm.c, so fold it in. Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_svm.h | 53 -------------------------------------------------- arch/x86/kvm/svm.c | 41 +++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 54 deletions(-) delete mode 100644 arch/x86/kvm/kvm_svm.h diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h deleted file mode 100644 index 6f275b4cf628..000000000000 --- a/arch/x86/kvm/kvm_svm.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef __KVM_SVM_H -#define __KVM_SVM_H - -#include -#include -#include -#include -#include - -#include - -static const u32 host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, -#endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, -}; - -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) - -struct kvm_vcpu; - -struct vcpu_svm { - struct kvm_vcpu vcpu; - struct vmcb *vmcb; - unsigned long vmcb_pa; - struct svm_cpu_data *svm_data; - uint64_t asid_generation; - uint64_t sysenter_esp; - uint64_t sysenter_eip; - - u64 next_rip; - - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - u64 host_gs_base; - unsigned long host_cr2; - - u32 *msrpm; - struct vmcb *hsave; - u64 hsave_msr; - - u64 nested_vmcb; - - /* These are the merged vectors */ - u32 *nested_msrpm; - - /* gpa pointers to the real vectors */ - u64 nested_vmcb_msrpm; -}; - -#endif - diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e3e7edca35d1..522e69597a16 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -15,7 +15,6 @@ */ #include -#include "kvm_svm.h" #include "irq.h" #include "mmu.h" #include "kvm_cache_regs.h" @@ -57,6 +56,46 @@ MODULE_LICENSE("GPL"); #define nsvm_printk(fmt, args...) do {} while(0) #endif +static const u32 host_save_user_msrs[] = { +#ifdef CONFIG_X86_64 + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, +}; + +#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) + +struct kvm_vcpu; + +struct vcpu_svm { + struct kvm_vcpu vcpu; + struct vmcb *vmcb; + unsigned long vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + + u64 next_rip; + + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + u64 host_gs_base; + unsigned long host_cr2; + + u32 *msrpm; + struct vmcb *hsave; + u64 hsave_msr; + + u64 nested_vmcb; + + /* These are the merged vectors */ + u32 *nested_msrpm; + + /* gpa pointers to the real vectors */ + u64 nested_vmcb_msrpm; +}; + /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; -- cgit v1.2.3-70-g09d2 From 2986b8c72c272ea58edd37903b042c6da985627d Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 2 Jun 2009 11:46:14 +1000 Subject: KVM: powerpc: fix some init/exit annotations Fixes a couple of warnings like this one: WARNING: arch/powerpc/kvm/kvm-440.o(.text+0x1e8c): Section mismatch in reference from the function kvmppc_44x_exit() to the function .exit.text:kvmppc_booke_exit() The function kvmppc_44x_exit() references a function in an exit section. Often the function kvmppc_booke_exit() has valid usage outside the exit section and the fix is to remove the __exit annotation of kvmppc_booke_exit. Also add some __init annotations on obvious routines. Signed-off-by: Stephen Rothwell Signed-off-by: Avi Kivity --- arch/powerpc/kvm/44x.c | 4 ++-- arch/powerpc/kvm/booke.c | 2 +- arch/powerpc/kvm/e500.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 0cef809cec21..f4d1b55aa70b 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -138,7 +138,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_44x); } -static int kvmppc_44x_init(void) +static int __init kvmppc_44x_init(void) { int r; @@ -149,7 +149,7 @@ static int kvmppc_44x_init(void) return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE); } -static void kvmppc_44x_exit(void) +static void __exit kvmppc_44x_exit(void) { kvmppc_booke_exit(); } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 642e4204cf25..e7bf4d029484 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -520,7 +520,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return kvmppc_core_vcpu_translate(vcpu, tr); } -int kvmppc_booke_init(void) +int __init kvmppc_booke_init(void) { unsigned long ivor[16]; unsigned long max_ivor = 0; diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index d8067fd81cdd..674e796f7aa5 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -132,7 +132,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } -static int kvmppc_e500_init(void) +static int __init kvmppc_e500_init(void) { int r, i; unsigned long ivor[3]; @@ -160,7 +160,7 @@ static int kvmppc_e500_init(void) return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE); } -static void kvmppc_e500_exit(void) +static void __init kvmppc_e500_exit(void) { kvmppc_booke_exit(); } -- cgit v1.2.3-70-g09d2 From 787a660a4f03325a0e00493ac39017e53fd345fa Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 1 Jun 2009 12:54:45 -0400 Subject: KVM: Clean up coalesced_mmio destruction We invoke kfree() on a data member instead of the structure. This works today because the kvm_io_device is the first element of the private structure, but this could change in the future, so lets clean this up. Signed-off-by: Gregory Haskins Acked-by: Chris Wright Signed-off-by: Avi Kivity --- virt/kvm/coalesced_mmio.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 5ae620d32fac..03ea2807b1f2 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -80,7 +80,10 @@ static void coalesced_mmio_write(struct kvm_io_device *this, static void coalesced_mmio_destructor(struct kvm_io_device *this) { - kfree(this); + struct kvm_coalesced_mmio_dev *dev = + (struct kvm_coalesced_mmio_dev *)this->private; + + kfree(dev); } int kvm_coalesced_mmio_init(struct kvm *kvm) -- cgit v1.2.3-70-g09d2 From d76685c4a074041ed168e0b04dd604c3df5dcaa5 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 1 Jun 2009 12:54:50 -0400 Subject: KVM: cleanup io_device code We modernize the io_device code so that we use container_of() instead of dev->private, and move the vtable to a separate ops structure (theoretically allows better caching for multiple instances of the same ops structure) Signed-off-by: Gregory Haskins Acked-by: Chris Wright Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 40 ++++++++++++++++++++++++++++------------ arch/x86/kvm/i8259.c | 20 ++++++++++++++------ arch/x86/kvm/lapic.c | 22 +++++++++++++++------- arch/x86/kvm/x86.c | 2 +- virt/kvm/coalesced_mmio.c | 25 +++++++++++++++---------- virt/kvm/ioapic.c | 22 +++++++++++++++------- virt/kvm/iodev.h | 29 ++++++++++++++++++++--------- virt/kvm/kvm_main.c | 2 +- 8 files changed, 109 insertions(+), 53 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0990bc9aac1f..e800d2d66266 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -350,10 +350,20 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) mutex_unlock(&kvm->arch.vpit->pit_state.lock); } +static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pit, dev); +} + +static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pit, speaker_dev); +} + static void pit_ioport_write(struct kvm_io_device *this, gpa_t addr, int len, const void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; int channel, access; @@ -426,7 +436,7 @@ static void pit_ioport_write(struct kvm_io_device *this, static void pit_ioport_read(struct kvm_io_device *this, gpa_t addr, int len, void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; int ret, count; @@ -497,7 +507,7 @@ static int pit_in_range(struct kvm_io_device *this, gpa_t addr, static void speaker_ioport_write(struct kvm_io_device *this, gpa_t addr, int len, const void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; u32 val = *(u32 *) data; @@ -511,7 +521,7 @@ static void speaker_ioport_write(struct kvm_io_device *this, static void speaker_ioport_read(struct kvm_io_device *this, gpa_t addr, int len, void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; unsigned int refresh_clock; @@ -563,6 +573,18 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) } } +static const struct kvm_io_device_ops pit_dev_ops = { + .read = pit_ioport_read, + .write = pit_ioport_write, + .in_range = pit_in_range, +}; + +static const struct kvm_io_device_ops speaker_dev_ops = { + .read = speaker_ioport_read, + .write = speaker_ioport_write, + .in_range = speaker_in_range, +}; + struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -583,17 +605,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) spin_lock_init(&pit->pit_state.inject_lock); /* Initialize PIO device */ - pit->dev.read = pit_ioport_read; - pit->dev.write = pit_ioport_write; - pit->dev.in_range = pit_in_range; - pit->dev.private = pit; + kvm_iodevice_init(&pit->dev, &pit_dev_ops); kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); if (flags & KVM_PIT_SPEAKER_DUMMY) { - pit->speaker_dev.read = speaker_ioport_read; - pit->speaker_dev.write = speaker_ioport_write; - pit->speaker_dev.in_range = speaker_in_range; - pit->speaker_dev.private = pit; + kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 1ccb50c74f18..2520922282d5 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -444,10 +444,15 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, } } +static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pic, dev); +} + static void picdev_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val) { - struct kvm_pic *s = this->private; + struct kvm_pic *s = to_pic(this); unsigned char data = *(unsigned char *)val; if (len != 1) { @@ -474,7 +479,7 @@ static void picdev_write(struct kvm_io_device *this, static void picdev_read(struct kvm_io_device *this, gpa_t addr, int len, void *val) { - struct kvm_pic *s = this->private; + struct kvm_pic *s = to_pic(this); unsigned char data = 0; if (len != 1) { @@ -516,6 +521,12 @@ static void pic_irq_request(void *opaque, int level) } } +static const struct kvm_io_device_ops picdev_ops = { + .read = picdev_read, + .write = picdev_write, + .in_range = picdev_in_range, +}; + struct kvm_pic *kvm_create_pic(struct kvm *kvm) { struct kvm_pic *s; @@ -534,10 +545,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) /* * Initialize PIO device */ - s->dev.read = picdev_read; - s->dev.write = picdev_write; - s->dev.in_range = picdev_in_range; - s->dev.private = s; + kvm_iodevice_init(&s->dev, &picdev_ops); kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); return s; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ae99d83f81a3..4bfd458a4f3e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -522,10 +522,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) return val; } +static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_lapic, dev); +} + static void apic_mmio_read(struct kvm_io_device *this, gpa_t address, int len, void *data) { - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; + struct kvm_lapic *apic = to_lapic(this); unsigned int offset = address - apic->base_address; unsigned char alignment = offset & 0xf; u32 result; @@ -606,7 +611,7 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) static void apic_mmio_write(struct kvm_io_device *this, gpa_t address, int len, const void *data) { - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; + struct kvm_lapic *apic = to_lapic(this); unsigned int offset = address - apic->base_address; unsigned char alignment = offset & 0xf; u32 val; @@ -723,7 +728,7 @@ static void apic_mmio_write(struct kvm_io_device *this, static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, int len, int size) { - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; + struct kvm_lapic *apic = to_lapic(this); int ret = 0; @@ -917,6 +922,12 @@ static struct kvm_timer_ops lapic_timer_ops = { .is_periodic = lapic_is_periodic, }; +static const struct kvm_io_device_ops apic_mmio_ops = { + .read = apic_mmio_read, + .write = apic_mmio_write, + .in_range = apic_mmio_range, +}; + int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -951,10 +962,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; kvm_lapic_reset(vcpu); - apic->dev.read = apic_mmio_read; - apic->dev.write = apic_mmio_write; - apic->dev.in_range = apic_mmio_range; - apic->dev.private = apic; + kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; nomem_free_apic: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5eb3b8dd74b8..75e9df097845 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2264,7 +2264,7 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, if (vcpu->arch.apic) { dev = &vcpu->arch.apic->dev; - if (dev->in_range(dev, addr, len, is_write)) + if (kvm_iodevice_in_range(dev, addr, len, is_write)) return dev; } return NULL; diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 03ea2807b1f2..c4c7ec2f9d30 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -14,11 +14,15 @@ #include "coalesced_mmio.h" +static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_coalesced_mmio_dev, dev); +} + static int coalesced_mmio_in_range(struct kvm_io_device *this, gpa_t addr, int len, int is_write) { - struct kvm_coalesced_mmio_dev *dev = - (struct kvm_coalesced_mmio_dev*)this->private; + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_zone *zone; int next; int i; @@ -63,8 +67,7 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this, static void coalesced_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val) { - struct kvm_coalesced_mmio_dev *dev = - (struct kvm_coalesced_mmio_dev*)this->private; + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; /* kvm->lock must be taken by caller before call to in_range()*/ @@ -80,12 +83,17 @@ static void coalesced_mmio_write(struct kvm_io_device *this, static void coalesced_mmio_destructor(struct kvm_io_device *this) { - struct kvm_coalesced_mmio_dev *dev = - (struct kvm_coalesced_mmio_dev *)this->private; + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); kfree(dev); } +static const struct kvm_io_device_ops coalesced_mmio_ops = { + .write = coalesced_mmio_write, + .in_range = coalesced_mmio_in_range, + .destructor = coalesced_mmio_destructor, +}; + int kvm_coalesced_mmio_init(struct kvm *kvm) { struct kvm_coalesced_mmio_dev *dev; @@ -93,10 +101,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm) dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); if (!dev) return -ENOMEM; - dev->dev.write = coalesced_mmio_write; - dev->dev.in_range = coalesced_mmio_in_range; - dev->dev.destructor = coalesced_mmio_destructor; - dev->dev.private = dev; + kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; kvm->coalesced_mmio_dev = dev; kvm_io_bus_register_dev(&kvm->mmio_bus, &dev->dev); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 1150c6d5c7b8..469b9faae665 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -220,10 +220,15 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); } +static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_ioapic, dev); +} + static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, int len, int is_write) { - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + struct kvm_ioapic *ioapic = to_ioapic(this); return ((addr >= ioapic->base_address && (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); @@ -232,7 +237,7 @@ static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, void *val) { - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + struct kvm_ioapic *ioapic = to_ioapic(this); u32 result; ioapic_debug("addr %lx\n", (unsigned long)addr); @@ -269,7 +274,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val) { - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + struct kvm_ioapic *ioapic = to_ioapic(this); u32 data; ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", @@ -314,6 +319,12 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->id = 0; } +static const struct kvm_io_device_ops ioapic_mmio_ops = { + .read = ioapic_mmio_read, + .write = ioapic_mmio_write, + .in_range = ioapic_in_range, +}; + int kvm_ioapic_init(struct kvm *kvm) { struct kvm_ioapic *ioapic; @@ -323,10 +334,7 @@ int kvm_ioapic_init(struct kvm *kvm) return -ENOMEM; kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); - ioapic->dev.read = ioapic_mmio_read; - ioapic->dev.write = ioapic_mmio_write; - ioapic->dev.in_range = ioapic_in_range; - ioapic->dev.private = ioapic; + kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); return 0; diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h index 55e8846ac3a6..2c67f5acd6db 100644 --- a/virt/kvm/iodev.h +++ b/virt/kvm/iodev.h @@ -18,7 +18,9 @@ #include -struct kvm_io_device { +struct kvm_io_device; + +struct kvm_io_device_ops { void (*read)(struct kvm_io_device *this, gpa_t addr, int len, @@ -30,16 +32,25 @@ struct kvm_io_device { int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len, int is_write); void (*destructor)(struct kvm_io_device *this); +}; + - void *private; +struct kvm_io_device { + const struct kvm_io_device_ops *ops; }; +static inline void kvm_iodevice_init(struct kvm_io_device *dev, + const struct kvm_io_device_ops *ops) +{ + dev->ops = ops; +} + static inline void kvm_iodevice_read(struct kvm_io_device *dev, gpa_t addr, int len, void *val) { - dev->read(dev, addr, len, val); + dev->ops->read(dev, addr, len, val); } static inline void kvm_iodevice_write(struct kvm_io_device *dev, @@ -47,19 +58,19 @@ static inline void kvm_iodevice_write(struct kvm_io_device *dev, int len, const void *val) { - dev->write(dev, addr, len, val); + dev->ops->write(dev, addr, len, val); } -static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, - gpa_t addr, int len, int is_write) +static inline int kvm_iodevice_in_range(struct kvm_io_device *dev, + gpa_t addr, int len, int is_write) { - return dev->in_range(dev, addr, len, is_write); + return dev->ops->in_range(dev, addr, len, is_write); } static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) { - if (dev->destructor) - dev->destructor(dev); + if (dev->ops->destructor) + dev->ops->destructor(dev); } #endif /* __KVM_IODEV_H__ */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d9bc439d9a9..d1f9c87daa87 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2463,7 +2463,7 @@ struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, for (i = 0; i < bus->dev_count; i++) { struct kvm_io_device *pos = bus->devs[i]; - if (pos->in_range(pos, addr, len, is_write)) + if (kvm_iodevice_in_range(pos, addr, len, is_write)) return pos; } -- cgit v1.2.3-70-g09d2 From 6b66ac1ae3328177305a2600eb85b7446f41fdc9 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 1 Jun 2009 12:54:56 -0400 Subject: KVM: do not register i8254 PIO regions until we are initialized We currently publish the i8254 resources to the pio_bus before the devices are fully initialized. Since we hold the pit_lock, its probably not a real issue. But lets clean this up anyway. Reported-by: Avi Kivity Signed-off-by: Gregory Haskins Acked-by: Chris Wright Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e800d2d66266..977af7ab8193 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -604,15 +604,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_lock(&pit->pit_state.lock); spin_lock_init(&pit->pit_state.inject_lock); - /* Initialize PIO device */ - kvm_iodevice_init(&pit->dev, &pit_dev_ops); - kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); - - if (flags & KVM_PIT_SPEAKER_DUMMY) { - kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); - } - kvm->arch.vpit = pit; pit->kvm = kvm; @@ -631,6 +622,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->mask_notifier.func = pit_mask_notifer; kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_iodevice_init(&pit->dev, &pit_dev_ops); + kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + + if (flags & KVM_PIT_SPEAKER_DUMMY) { + kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); + kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); + } + return pit; } -- cgit v1.2.3-70-g09d2 From 2d84e993a8947dbeb6b416555940d97522330846 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 31 May 2009 18:29:59 +0300 Subject: KVM: VMX: Avoid duplicate ept tlb flush when setting cr3 vmx_set_cr3() will call vmx_tlb_flush(), which will flush the ept context. So there is no need to call ept_sync_context() explicitly. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c14bffc8c1f9..ea0e1d5ebe70 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1651,7 +1651,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - ept_sync_context(eptp); ept_load_pdptrs(vcpu); guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : VMX_EPT_IDENTITY_PAGETABLE_ADDR; -- cgit v1.2.3-70-g09d2 From 8f5d549f028056d6ad6044f2d9e27ecf361d955e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 31 May 2009 18:41:29 +0300 Subject: KVM: VMX: Simplify pdptr and cr3 management Instead of reading the PDPTRs from memory after every exit (which is slow and wrong, as the PDPTRs are stored on the cpu), sync the PDPTRs from memory to the VMCS before entry, and from the VMCS to memory after exit. Do the same for cr3. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ea0e1d5ebe70..752465f98bfd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1547,10 +1547,6 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_ERR "EPT: Fail to load pdptrs!\n"); - return; - } vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); @@ -1558,6 +1554,16 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) } } +static void ept_save_pdptrs(struct kvm_vcpu *vcpu) +{ + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + } +} + static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -1651,7 +1657,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - ept_load_pdptrs(vcpu); guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : VMX_EPT_IDENTITY_PAGETABLE_ADDR; } @@ -3252,7 +3257,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * to sync with guest real CR3. */ if (enable_ept && is_paging(vcpu)) { vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - ept_load_pdptrs(vcpu); + ept_save_pdptrs(vcpu); } if (unlikely(vmx->fail)) { @@ -3437,6 +3442,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_ept && is_paging(vcpu)) { + vmcs_writel(GUEST_CR3, vcpu->arch.cr3); + ept_load_pdptrs(vcpu); + } /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); -- cgit v1.2.3-70-g09d2 From 6de4f3ada40b336522250a7832a0cc4de8856589 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 31 May 2009 22:58:47 +0300 Subject: KVM: Cache pdptrs Instead of reloading the pdptrs on every entry and exit (vmcs writes on vmx, guest memory access on svm) extract them on demand. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/kvm_cache_regs.h | 9 +++++++++ arch/x86/kvm/mmu.c | 7 +++++-- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/svm.c | 24 ++++++++++++++++++------ arch/x86/kvm/vmx.c | 22 ++++++++++++++++++---- arch/x86/kvm/x86.c | 8 ++++++++ 7 files changed, 63 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 81c68f630b14..1cc901ec4ba5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -120,6 +120,10 @@ enum kvm_reg { NR_VCPU_REGS }; +enum kvm_reg_ex { + VCPU_EXREG_PDPTR = NR_VCPU_REGS, +}; + enum { VCPU_SREG_ES, VCPU_SREG_CS, diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 1ff819dce7d3..7bcc5b6a4403 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) kvm_register_write(vcpu, VCPU_REGS_RIP, val); } +static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) +{ + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); + + return vcpu->arch.pdptrs[index]; +} + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0ef5bb2b4043..8ee67e3fb9d0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "kvm_cache_regs.h" #include #include @@ -1954,6 +1955,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) gfn_t root_gfn; struct kvm_mmu_page *sp; int direct = 0; + u64 pdptr; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1981,11 +1983,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - if (!is_present_pte(vcpu->arch.pdptrs[i])) { + pdptr = kvm_pdptr_read(vcpu, i); + if (!is_present_pte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } - root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; + root_gfn = pdptr >> PAGE_SHIFT; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; if (mmu_check_root(vcpu, root_gfn)) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67785f635399..4cb1dbfd7c2a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -131,7 +131,7 @@ walk: pte = vcpu->arch.cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { - pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; + pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); if (!is_present_pte(pte)) goto not_present; --walker->level; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 522e69597a16..7749b0692cb2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -777,6 +777,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + switch (reg) { + case VCPU_EXREG_PDPTR: + BUG_ON(!npt_enabled); + load_pdptrs(vcpu, vcpu->arch.cr3); + break; + default: + BUG(); + } +} + static void svm_set_vintr(struct vcpu_svm *svm) { svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; @@ -2285,12 +2297,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } vcpu->arch.cr0 = svm->vmcb->save.cr0; vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - } if (mmu_reload) { kvm_mmu_reset_context(vcpu); kvm_mmu_load(vcpu); @@ -2641,6 +2647,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->next_rip = 0; + if (npt_enabled) { + vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); + vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); + } + svm_complete_interrupts(svm); } @@ -2749,6 +2760,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_gdt = svm_set_gdt, .get_dr = svm_get_dr, .set_dr = svm_set_dr, + .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 752465f98bfd..d726dec69529 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -161,6 +161,8 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static void ept_save_pdptrs(struct kvm_vcpu *vcpu); + /* * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. @@ -1047,6 +1049,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) case VCPU_REGS_RIP: vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); break; + case VCPU_EXREG_PDPTR: + if (enable_ept) + ept_save_pdptrs(vcpu); + break; default: break; } @@ -1546,6 +1552,10 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty)) + return; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); @@ -1562,6 +1572,11 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } + + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); @@ -3255,10 +3270,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ - if (enable_ept && is_paging(vcpu)) { + if (enable_ept && is_paging(vcpu)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - ept_save_pdptrs(vcpu); - } if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -3567,7 +3580,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif ); - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; get_debugreg(vcpu->arch.dr6, 6); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 75e9df097845..2ad8c97f58cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -246,6 +246,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) ret = 1; memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); out: return ret; @@ -261,6 +265,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) if (is_long_mode(vcpu) || !is_pae(vcpu)) return false; + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + return true; + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); if (r < 0) goto out; -- cgit v1.2.3-70-g09d2 From 596ae895653fe336b9b5815ad2c175d22bb26f21 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 3 Jun 2009 14:12:10 +0300 Subject: KVM: VMX: Fix reporting of unhandled EPT violations Instead of returning -ENOTSUPP, exit normally but indicate the hardware exit reason. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d726dec69529..959cb59cfaeb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3153,8 +3153,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = 0; - return -ENOTSUPP; + kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + return 0; } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); -- cgit v1.2.3-70-g09d2 From 105f8d40a737564100dc7dcd1d94d4e15fada20e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 4 Jun 2009 18:09:08 +0300 Subject: KVM: Calculate available entries in coalesced mmio ring Instead of checking whether we'll wrap around, calculate how many entries are available, and check whether we have enough (just one) for the pending mmio. By itself, this doesn't change anything, but it paves the way for making this function lockless. Signed-off-by: Avi Kivity --- virt/kvm/coalesced_mmio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index c4c7ec2f9d30..754906800999 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -24,7 +24,8 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this, { struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_zone *zone; - int next; + struct kvm_coalesced_mmio_ring *ring; + unsigned avail; int i; if (!is_write) @@ -40,10 +41,9 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this, * check if we don't meet the first used entry * there is always one unused entry in the buffer */ - - next = (dev->kvm->coalesced_mmio_ring->last + 1) % - KVM_COALESCED_MMIO_MAX; - if (next == dev->kvm->coalesced_mmio_ring->first) { + ring = dev->kvm->coalesced_mmio_ring; + avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; + if (avail < 1) { /* full */ return 0; } -- cgit v1.2.3-70-g09d2 From 0cfb50e50dd27539fed9ba3cb33b0f22c0cddef0 Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Fri, 5 Jun 2009 14:54:29 +0800 Subject: KVM: ppc: e500: Move to Book-3e MMU definitions According to commit 70fe3af8403f85196bb74f22ce4813db7dfedc1a. Signed-off-by: Liu Yu Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 10 +++++----- arch/powerpc/kvm/e500_tlb.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 0e773fc2d5e4..a2048ac095ab 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -269,7 +269,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; pidsel = (vcpu_e500->mas4 >> 16) & 0xf; - tsized = (vcpu_e500->mas4 >> 8) & 0xf; + tsized = (vcpu_e500->mas4 >> 7) & 0x1f; vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); @@ -309,7 +309,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, vcpu_e500->shadow_pages[tlbsel][esel] = new_page; /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ - stlbe->mas1 = MAS1_TSIZE(BOOKE_PAGESZ_4K) + stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K) | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; stlbe->mas2 = (gvaddr & MAS2_EPN) | e500_shadow_mas2_attrib(gtlbe->mas2, @@ -545,7 +545,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) case 0: /* TLB0 */ gtlbe->mas1 &= ~MAS1_TSIZE(~0); - gtlbe->mas1 |= MAS1_TSIZE(BOOKE_PAGESZ_4K); + gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); stlbsel = 0; sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel); @@ -679,14 +679,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) /* Insert large initial mapping for guest. */ tlbe = &vcpu_e500->guest_tlb[1][0]; - tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_256M); + tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); tlbe->mas2 = 0; tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; tlbe->mas7 = 0; /* 4K map for serial output. Used by kernel wrapper. */ tlbe = &vcpu_e500->guest_tlb[1][1]; - tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_4K); + tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; tlbe->mas7 = 0; diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index 45b064b76906..d28e3010a5e2 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -16,7 +16,7 @@ #define __KVM_E500_TLB_H__ #include -#include +#include #include #include @@ -59,7 +59,7 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); /* TLB helper functions */ static inline unsigned int get_tlb_size(const struct tlbe *tlbe) { - return (tlbe->mas1 >> 8) & 0xf; + return (tlbe->mas1 >> 7) & 0x1f; } static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) @@ -70,7 +70,7 @@ static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) static inline u64 get_tlb_bytes(const struct tlbe *tlbe) { unsigned int pgsize = get_tlb_size(tlbe); - return 1ULL << 10 << (pgsize << 1); + return 1ULL << 10 << pgsize; } static inline gva_t get_tlb_end(const struct tlbe *tlbe) -- cgit v1.2.3-70-g09d2 From 5b7c1a2c17e77cd5416755bb9ac63278996f6c51 Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Fri, 5 Jun 2009 14:54:30 +0800 Subject: KVM: ppc: e500: Directly pass pvr to guest Signed-off-by: Liu Yu Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/e500.c | 3 --- arch/powerpc/kvm/emulate.c | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index fddc3ed715fa..d4caa6127f55 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -153,7 +153,6 @@ struct kvm_vcpu_arch { u32 pid; u32 swap_pid; - u32 pvr; u32 ccr0; u32 ccr1; u32 dbcr0; diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 674e796f7aa5..64949eef43f1 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -60,9 +60,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_e500_tlb_setup(vcpu_e500); - /* Use the same core vertion as host's */ - vcpu->arch.pvr = mfspr(SPRN_PVR); - return 0; } diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index a561d6e8da1c..f8b8248cb9b0 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -187,7 +187,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_SRR1: vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; case SPRN_PVR: - vcpu->arch.gpr[rt] = vcpu->arch.pvr; break; + vcpu->arch.gpr[rt] = mfspr(SPRN_PVR); break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. -- cgit v1.2.3-70-g09d2 From 06579dd9c12f36ac575460a226d2020d84a0128b Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Fri, 5 Jun 2009 14:54:31 +0800 Subject: KVM: ppc: e500: Add MMUCFG and PVR emulation Latest kernel started to use these two registers. Signed-off-by: Liu Yu Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_emulate.c | 3 +++ arch/powerpc/kvm/emulate.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 3f760414b9f8..be95b8d8e3b7 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -180,6 +180,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_MMUCSR0: vcpu->arch.gpr[rt] = 0; break; + case SPRN_MMUCFG: + vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break; + /* extra exceptions */ case SPRN_IVOR32: vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index f8b8248cb9b0..28a8237fe78b 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -188,6 +188,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; case SPRN_PVR: vcpu->arch.gpr[rt] = mfspr(SPRN_PVR); break; + case SPRN_PIR: + vcpu->arch.gpr[rt] = mfspr(SPRN_PIR); break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. -- cgit v1.2.3-70-g09d2 From 238adc77051ab19b35663e7f214f52eaf44ed7ef Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 5 Jun 2009 16:13:34 +0200 Subject: KVM: Cleanup LAPIC interface None of the interface services the LAPIC emulation provides need to be exported to modules, and kvm_lapic_get_base is even totally unused today. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4bfd458a4f3e..a23f42e550af 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -194,7 +194,6 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) return highest_irr; } -EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode); @@ -768,7 +767,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | (apic_get_reg(apic, APIC_TASKPRI) & 4)); } -EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { @@ -781,7 +779,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) return (tpr & 0xf0) >> 4; } -EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { @@ -805,12 +802,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.apic_base; -} -EXPORT_SYMBOL_GPL(kvm_lapic_get_base); - void kvm_lapic_reset(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -860,7 +851,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); } -EXPORT_SYMBOL_GPL(kvm_lapic_reset); bool kvm_apic_present(struct kvm_vcpu *vcpu) { @@ -871,7 +861,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu) { return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_enabled); /* *---------------------------------------------------------------------- @@ -970,7 +959,6 @@ nomem_free_apic: nomem: return -ENOMEM; } -EXPORT_SYMBOL_GPL(kvm_create_lapic); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { -- cgit v1.2.3-70-g09d2 From 9f4cc12765ea48a40347449d6802a3322ced8709 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 4 Jun 2009 15:08:21 -0300 Subject: KVM: Grab pic lock in kvm_pic_clear_isr_ack isr_ack is protected by kvm_pic->lock. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 2520922282d5..bf94a45f4f86 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -72,8 +72,10 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); + pic_lock(s); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; + pic_unlock(s); } /* -- cgit v1.2.3-70-g09d2 From 64a2268dcfc9c3626aa7f70902690e2fc10c1630 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 4 Jun 2009 15:08:22 -0300 Subject: KVM: move coalesced_mmio locking to its own device Move coalesced_mmio locking to its own device, instead of relying on kvm->lock. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/coalesced_mmio.c | 10 ++++------ virt/kvm/coalesced_mmio.h | 1 + 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 754906800999..397f41936698 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -31,10 +31,6 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this, if (!is_write) return 0; - /* kvm->lock is taken by the caller and must be not released before - * dev.read/write - */ - /* Are we able to batch it ? */ /* last is the first free entry @@ -43,7 +39,7 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this, */ ring = dev->kvm->coalesced_mmio_ring; avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; - if (avail < 1) { + if (avail < KVM_MAX_VCPUS) { /* full */ return 0; } @@ -70,7 +66,7 @@ static void coalesced_mmio_write(struct kvm_io_device *this, struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; - /* kvm->lock must be taken by caller before call to in_range()*/ + spin_lock(&dev->lock); /* copy data in first free entry of the ring */ @@ -79,6 +75,7 @@ static void coalesced_mmio_write(struct kvm_io_device *this, memcpy(ring->coalesced_mmio[ring->last].data, val, len); smp_wmb(); ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; + spin_unlock(&dev->lock); } static void coalesced_mmio_destructor(struct kvm_io_device *this) @@ -101,6 +98,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm) dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); if (!dev) return -ENOMEM; + spin_lock_init(&dev->lock); kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; kvm->coalesced_mmio_dev = dev; diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h index 5ac0ec628461..4b49f27fa31e 100644 --- a/virt/kvm/coalesced_mmio.h +++ b/virt/kvm/coalesced_mmio.h @@ -12,6 +12,7 @@ struct kvm_coalesced_mmio_dev { struct kvm_io_device dev; struct kvm *kvm; + spinlock_t lock; int nb_zones; struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; }; -- cgit v1.2.3-70-g09d2 From 60eead79ad8750f80384cbe48fc44edcc78a0305 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 4 Jun 2009 15:08:23 -0300 Subject: KVM: introduce irq_lock, use it to protect ioapic Introduce irq_lock, and use to protect ioapic data structures. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 3 ++- virt/kvm/ioapic.c | 5 +++++ virt/kvm/kvm_main.c | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 19240feefe6f..0c71688b1ee3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -124,7 +124,6 @@ struct kvm_kernel_irq_routing_entry { }; struct kvm { - struct mutex lock; /* protects the vcpus array and APIC accesses */ spinlock_t mmu_lock; spinlock_t requests_lock; struct rw_semaphore slots_lock; @@ -134,6 +133,7 @@ struct kvm { KVM_PRIVATE_MEM_SLOTS]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; struct list_head vm_list; + struct mutex lock; struct kvm_io_bus mmio_bus; struct kvm_io_bus pio_bus; #ifdef CONFIG_HAVE_KVM_EVENTFD @@ -150,6 +150,7 @@ struct kvm { struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; #endif + struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */ struct hlist_head mask_notifier_list; diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 469b9faae665..2a5667173995 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -243,6 +243,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, ioapic_debug("addr %lx\n", (unsigned long)addr); ASSERT(!(addr & 0xf)); /* check alignment */ + mutex_lock(&ioapic->kvm->irq_lock); addr &= 0xff; switch (addr) { case IOAPIC_REG_SELECT: @@ -269,6 +270,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, default: printk(KERN_WARNING "ioapic: wrong length %d\n", len); } + mutex_unlock(&ioapic->kvm->irq_lock); } static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, @@ -280,6 +282,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", (void*)addr, len, val); ASSERT(!(addr & 0xf)); /* check alignment */ + + mutex_lock(&ioapic->kvm->irq_lock); if (len == 4 || len == 8) data = *(u32 *) val; else { @@ -305,6 +309,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, default: break; } + mutex_unlock(&ioapic->kvm->irq_lock); } void kvm_ioapic_reset(struct kvm_ioapic *ioapic) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d1f9c87daa87..d47e660fb709 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -982,6 +982,7 @@ static struct kvm *kvm_create_vm(void) kvm_io_bus_init(&kvm->pio_bus); kvm_irqfd_init(kvm); mutex_init(&kvm->lock); + mutex_init(&kvm->irq_lock); kvm_io_bus_init(&kvm->mmio_bus); init_rwsem(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); -- cgit v1.2.3-70-g09d2 From fa40a8214bb9bcae8d49c234c19d8b4a6c1f37ff Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 4 Jun 2009 15:08:24 -0300 Subject: KVM: switch irq injection/acking data structures to irq_lock Protect irq injection/acking data structures with a separate irq_lock mutex. This fixes the following deadlock: CPU A CPU B kvm_vm_ioctl_deassign_dev_irq() mutex_lock(&kvm->lock); worker_thread() -> kvm_deassign_irq() -> kvm_assigned_dev_interrupt_work_handler() -> deassign_host_irq() mutex_lock(&kvm->lock); -> cancel_work_sync() [blocked] [gleb: fix ia64 path] Reported-by: Alex Williamson Signed-off-by: Marcelo Tosatti Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 4 ++-- arch/x86/kvm/i8254.c | 4 ++-- arch/x86/kvm/lapic.c | 4 ++++ arch/x86/kvm/x86.c | 19 +++++++++---------- include/linux/kvm_host.h | 3 ++- virt/kvm/eventfd.c | 4 ++-- virt/kvm/irq_comm.c | 34 ++++++++++++++++++++++++++++------ virt/kvm/kvm_main.c | 16 +++++++++------- 8 files changed, 58 insertions(+), 30 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 319922137fdd..8dde36953af3 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1000,10 +1000,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; if (irqchip_in_kernel(kvm)) { __s32 status; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); if (ioctl == KVM_IRQ_LINE_STATUS) { irq_event.status = status; if (copy_to_user(argp, &irq_event, diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 977af7ab8193..3837db65d33e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -654,10 +654,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm) struct kvm_vcpu *vcpu; int i; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); /* * Provides NMI watchdog support via Virtual Wire mode. diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a23f42e550af..44f20cdb5709 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -424,7 +424,9 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; + mutex_lock(&apic->vcpu->kvm->irq_lock); kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + mutex_unlock(&apic->vcpu->kvm->irq_lock); } static void apic_send_ipi(struct kvm_lapic *apic) @@ -448,7 +450,9 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, irq.vector); + mutex_lock(&apic->vcpu->kvm->irq_lock); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); + mutex_unlock(&apic->vcpu->kvm->irq_lock); } static u32 apic_get_tmcct(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2ad8c97f58cc..05cbe83c74e2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2136,10 +2136,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; if (irqchip_in_kernel(kvm)) { __s32 status; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); if (ioctl == KVM_IRQ_LINE_STATUS) { irq_event.status = status; if (copy_to_user(argp, &irq_event, @@ -2385,12 +2385,11 @@ mmio: */ mutex_lock(&vcpu->kvm->lock); mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); + mutex_unlock(&vcpu->kvm->lock); if (mmio_dev) { kvm_iodevice_read(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); return X86EMUL_CONTINUE; } - mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -2440,12 +2439,11 @@ mmio: */ mutex_lock(&vcpu->kvm->lock); mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); + mutex_unlock(&vcpu->kvm->lock); if (mmio_dev) { kvm_iodevice_write(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); return X86EMUL_CONTINUE; } - mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -2768,7 +2766,6 @@ static void kernel_pio(struct kvm_io_device *pio_dev, { /* TODO: String I/O for in kernel device */ - mutex_lock(&vcpu->kvm->lock); if (vcpu->arch.pio.in) kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, vcpu->arch.pio.size, @@ -2777,7 +2774,6 @@ static void kernel_pio(struct kvm_io_device *pio_dev, kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); - mutex_unlock(&vcpu->kvm->lock); } static void pio_string_write(struct kvm_io_device *pio_dev, @@ -2787,14 +2783,12 @@ static void pio_string_write(struct kvm_io_device *pio_dev, void *pd = vcpu->arch.pio_data; int i; - mutex_lock(&vcpu->kvm->lock); for (i = 0; i < io->cur_count; i++) { kvm_iodevice_write(pio_dev, io->port, io->size, pd); pd += io->size; } - mutex_unlock(&vcpu->kvm->lock); } static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, @@ -2831,7 +2825,9 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); + mutex_lock(&vcpu->kvm->lock); pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); + mutex_unlock(&vcpu->kvm->lock); if (pio_dev) { kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); complete_pio(vcpu); @@ -2895,9 +2891,12 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.guest_gva = address; + mutex_lock(&vcpu->kvm->lock); pio_dev = vcpu_find_pio_dev(vcpu, port, vcpu->arch.pio.cur_count, !vcpu->arch.pio.in); + mutex_unlock(&vcpu->kvm->lock); + if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0c71688b1ee3..a29ea030dd8e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -371,7 +371,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); -void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian); +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 314012323afe..4092b8dcd510 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -57,10 +57,10 @@ irqfd_inject(struct work_struct *work) struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); struct kvm *kvm = irqfd->kvm; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); } /* diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index ddc17f0e2f35..08a9a49481b2 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -62,6 +62,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, int i, r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; + WARN_ON(!mutex_is_locked(&kvm->irq_lock)); + if (irq->dest_mode == 0 && irq->dest_id == 0xff && kvm_is_dm_lowest_prio(irq)) printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); @@ -113,7 +115,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return kvm_irq_delivery_to_apic(kvm, NULL, &irq); } -/* This should be called with the kvm->lock mutex held +/* This should be called with the kvm->irq_lock mutex held * Return value: * < 0 Interrupt was ignored (masked or not delivered for other reasons) * = 0 Interrupt was coalesced (previous irq is still pending) @@ -125,6 +127,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) unsigned long *irq_state, sig_level; int ret = -1; + WARN_ON(!mutex_is_locked(&kvm->irq_lock)); + if (irq < KVM_IOAPIC_NUM_PINS) { irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; @@ -175,19 +179,26 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian) { + mutex_lock(&kvm->irq_lock); hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); + mutex_unlock(&kvm->irq_lock); } -void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian) +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) { + mutex_lock(&kvm->irq_lock); hlist_del_init(&kian->link); + mutex_unlock(&kvm->irq_lock); } -/* The caller must hold kvm->lock mutex */ int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; - int irq_source_id = find_first_zero_bit(bitmap, + int irq_source_id; + + mutex_lock(&kvm->irq_lock); + irq_source_id = find_first_zero_bit(bitmap, sizeof(kvm->arch.irq_sources_bitmap)); if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { @@ -197,6 +208,7 @@ int kvm_request_irq_source_id(struct kvm *kvm) ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); set_bit(irq_source_id, bitmap); + mutex_unlock(&kvm->irq_lock); return irq_source_id; } @@ -207,6 +219,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); @@ -215,19 +228,24 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) clear_bit(irq_source_id, &kvm->arch.irq_states[i]); clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); + mutex_unlock(&kvm->irq_lock); } void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { + mutex_lock(&kvm->irq_lock); kimn->irq = irq; hlist_add_head(&kimn->link, &kvm->mask_notifier_list); + mutex_unlock(&kvm->irq_lock); } void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { + mutex_lock(&kvm->irq_lock); hlist_del(&kimn->link); + mutex_unlock(&kvm->irq_lock); } void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) @@ -235,6 +253,8 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) struct kvm_irq_mask_notifier *kimn; struct hlist_node *n; + WARN_ON(!mutex_is_locked(&kvm->irq_lock)); + hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link) if (kimn->irq == irq) kimn->func(kimn, mask); @@ -250,7 +270,9 @@ static void __kvm_free_irq_routing(struct list_head *irq_routing) void kvm_free_irq_routing(struct kvm *kvm) { + mutex_lock(&kvm->irq_lock); __kvm_free_irq_routing(&kvm->irq_routing); + mutex_unlock(&kvm->irq_lock); } static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, @@ -325,13 +347,13 @@ int kvm_set_irq_routing(struct kvm *kvm, e = NULL; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); list_splice(&kvm->irq_routing, &tmp); INIT_LIST_HEAD(&kvm->irq_routing); list_splice(&irq_list, &kvm->irq_routing); INIT_LIST_HEAD(&irq_list); list_splice(&tmp, &irq_list); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); r = 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d47e660fb709..0d481b282448 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -62,6 +62,12 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +/* + * Ordering of locks: + * + * kvm->lock --> kvm->irq_lock + */ + DEFINE_SPINLOCK(kvm_lock); LIST_HEAD(vm_list); @@ -126,11 +132,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) interrupt_work); kvm = assigned_dev->kvm; - /* This is taken to safely inject irq inside the guest. When - * the interrupt injection (or the ioapic code) uses a - * finer-grained lock, update this - */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); spin_lock_irq(&assigned_dev->assigned_dev_lock); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { struct kvm_guest_msix_entry *guest_entries = @@ -149,7 +151,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) assigned_dev->guest_irq, 1); spin_unlock_irq(&assigned_dev->assigned_dev_lock); - mutex_unlock(&assigned_dev->kvm->lock); + mutex_unlock(&assigned_dev->kvm->irq_lock); } static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) @@ -207,7 +209,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) static void deassign_guest_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { - kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); assigned_dev->ack_notifier.gsi = -1; if (assigned_dev->irq_source_id != -1) -- cgit v1.2.3-70-g09d2 From 3a624e29c7587b79abab60e279f9d1a62a3d4716 Mon Sep 17 00:00:00 2001 From: Nitin A Kamble Date: Mon, 8 Jun 2009 11:34:16 -0700 Subject: KVM: VMX: Support Unrestricted Guest feature "Unrestricted Guest" feature is added in the VMX specification. Intel Westmere and onwards processors will support this feature. It allows kvm guests to run real mode and unpaged mode code natively in the VMX mode when EPT is turned on. With the unrestricted guest there is no need to emulate the guest real mode code in the vm86 container or in the emulator. Also the guest big real mode code works like native. The attached patch enhances KVM to use the unrestricted guest feature if available on the processor. It also adds a new kernel/module parameter to disable the unrestricted guest feature at the boot time. Signed-off-by: Nitin A Kamble Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 12 +++++---- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 60 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1cc901ec4ba5..a1a96a57bb9d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -37,12 +37,14 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ - | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ - | X86_CR0_MP) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_GUEST_CR4_MASK \ (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 11be5ad2e0e9..e7927a639d69 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -55,6 +55,7 @@ #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 +#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 #define PIN_BASED_EXT_INTR_MASK 0x00000001 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 959cb59cfaeb..f0f9773f0b0f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -51,6 +51,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); static int __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, S_IRUGO); +static int __read_mostly enable_unrestricted_guest = 1; +module_param_named(unrestricted_guest, + enable_unrestricted_guest, bool, S_IRUGO); + static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); @@ -279,6 +283,12 @@ static inline int cpu_has_vmx_ept(void) SECONDARY_EXEC_ENABLE_EPT; } +static inline int cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && @@ -1210,7 +1220,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | - SECONDARY_EXEC_ENABLE_EPT; + SECONDARY_EXEC_ENABLE_EPT | + SECONDARY_EXEC_UNRESTRICTED_GUEST; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1340,8 +1351,13 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_ept()) + if (!cpu_has_vmx_ept()) { enable_ept = 0; + enable_unrestricted_guest = 0; + } + + if (!cpu_has_vmx_unrestricted_guest()) + enable_unrestricted_guest = 0; if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; @@ -1440,6 +1456,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_unrestricted_guest) + return; + vmx->emulation_required = 1; vcpu->arch.rmode.vm86_active = 1; @@ -1593,7 +1612,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - *hw_cr0 |= X86_CR0_PE | X86_CR0_PG; *hw_cr0 &= ~X86_CR0_WP; } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ @@ -1620,8 +1638,13 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | - KVM_VM_CR0_ALWAYS_ON; + unsigned long hw_cr0; + + if (enable_unrestricted_guest) + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) + | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; vmx_fpu_deactivate(vcpu); @@ -1786,6 +1809,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, ar = 0xf3; } else ar = vmx_segment_access_rights(var); + + /* + * Fix the "Accessed" bit in AR field of segment registers for older + * qemu binaries. + * IA32 arch specifies that at the time of processor reset the + * "Accessed" bit in the AR field of segment registers is 1. And qemu + * is setting it to 0 in the usedland code. This causes invalid guest + * state vmexit when "unrestricted guest" mode is turned on. + * Fix for this setup issue in cpu_reset is being pushed in the qemu + * tree. Newer qemu binaries with that qemu fix would not need this + * kvm hack. + */ + if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + ar |= 0x1; /* Accessed */ + vmcs_write32(sf->ar_bytes, ar); } @@ -2082,11 +2120,19 @@ out: static void seg_setup(int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + unsigned int ar; vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0xf3); + if (enable_unrestricted_guest) { + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ + } else + ar = 0xf3; + + vmcs_write32(sf->ar_bytes, ar); } static int alloc_apic_access_page(struct kvm *kvm) @@ -2229,6 +2275,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } -- cgit v1.2.3-70-g09d2 From 6a4a98397331723dce25a7537270548d91523431 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 9 Jun 2009 11:33:36 +0300 Subject: KVM: Reorder ioctls in kvm.h Somehow the VM ioctls got unsorted; resort. Signed-off-by: Avi Kivity --- include/linux/kvm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 632a8560dbcf..f1dada0519eb 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -495,11 +495,6 @@ struct kvm_irqfd { * ioctls for VM fds */ #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) -#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) -#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) -#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ - struct kvm_userspace_memory_region) -#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. @@ -507,6 +502,11 @@ struct kvm_irqfd { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) +#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) +#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ + struct kvm_userspace_memory_region) +#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) -- cgit v1.2.3-70-g09d2 From 7ffd92c53c5ebd0ad5a68ac3ca033c3a06374d19 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 9 Jun 2009 14:10:45 +0300 Subject: KVM: VMX: Move rmode structure to vmx-specific code rmode is only used in vmx, so move it to vmx.c Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 ------ arch/x86/kvm/vmx.c | 78 +++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a1a96a57bb9d..c7b0cc2b7020 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -340,16 +340,6 @@ struct kvm_vcpu_arch { u8 nr; } interrupt; - struct { - int vm86_active; - u8 save_iopl; - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } tr, es, ds, fs, gs; - } rmode; int halt_request; /* real mode on Intel only */ int cpuid_nent; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f0f9773f0b0f..ae682929a642 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -88,6 +88,14 @@ struct vcpu_vmx { int guest_efer_loaded; } host_state; struct { + int vm86_active; + u8 save_iopl; + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } tr, es, ds, fs, gs; struct { bool pending; u8 vector; @@ -516,7 +524,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) eb |= 1u << BP_VECTOR; } - if (vcpu->arch.rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ @@ -752,7 +760,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->arch.rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } @@ -809,7 +817,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_INFO_DELIVER_CODE_MASK; } - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -1395,15 +1403,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.vm86_active = 0; + vmx->rmode.vm86_active = 0; - vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); + vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); + flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1414,10 +1422,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) return; - fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1460,19 +1468,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu) return; vmx->emulation_required = 1; - vcpu->arch.rmode.vm86_active = 1; + vmx->rmode.vm86_active = 1; - vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vcpu->arch.rmode.save_iopl + vmx->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1494,10 +1502,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); continue_rmode: kvm_mmu_reset_context(vcpu); @@ -1638,6 +1646,7 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0; if (enable_unrestricted_guest) @@ -1648,10 +1657,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx_fpu_deactivate(vcpu); - if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 @@ -1707,7 +1716,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? + unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; @@ -1787,20 +1796,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { - vcpu->arch.rmode.tr.selector = var->selector; - vcpu->arch.rmode.tr.base = var->base; - vcpu->arch.rmode.tr.limit = var->limit; - vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmx->rmode.tr.selector = var->selector; + vmx->rmode.tr.base = var->base; + vmx->rmode.tr.limit = var->limit; + vmx->rmode.tr.ar = vmx_segment_access_rights(var); return; } vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.vm86_active && var->s) { + if (vmx->rmode.vm86_active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -2394,7 +2404,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) goto out; } - vmx->vcpu.arch.rmode.vm86_active = 0; + vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -2532,7 +2542,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); ++vcpu->stat.irq_injections; - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2573,7 +2583,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = NMI_VECTOR; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2737,7 +2747,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return kvm_mmu_page_fault(vcpu, cr2, error_code); } - if (vcpu->arch.rmode.vm86_active && + if (vmx->rmode.vm86_active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { if (vcpu->arch.halt_request) { -- cgit v1.2.3-70-g09d2 From 439e218a6f4716da484314fc5a1f0a59b0212c01 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 10 Jun 2009 12:56:54 +0300 Subject: KVM: MMU: Fix is_dirty_pte() is_dirty_pte() is used on guest ptes, not shadow ptes, so it needs to avoid shadow_dirty_mask and use PT_DIRTY_MASK instead. Misdetecting dirty pages could lead to unnecessarily setting the dirty bit under EPT. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8ee67e3fb9d0..8f2cb29db2fe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -242,7 +242,7 @@ static int is_writeble_pte(unsigned long pte) static int is_dirty_pte(unsigned long pte) { - return pte & shadow_dirty_mask; + return pte & PT_DIRTY_MASK; } static int is_rmap_pte(u64 pte) -- cgit v1.2.3-70-g09d2 From 43a3795a3a12425de31e25ce0ebc3bb41501cef7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 10 Jun 2009 14:12:05 +0300 Subject: KVM: MMU: Adjust pte accessors to explicitly indicate guest or shadow pte Since the guest and host ptes can have wildly different format, adjust the pte accessor names to indicate on which type of pte they operate on. No functional changes. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 16 ++++++++-------- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/paging_tmpl.h | 22 +++++++++++----------- arch/x86/kvm/x86.c | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8f2cb29db2fe..a039e6bc21f7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -240,12 +240,12 @@ static int is_writeble_pte(unsigned long pte) return pte & PT_WRITABLE_MASK; } -static int is_dirty_pte(unsigned long pte) +static int is_dirty_gpte(unsigned long pte) { return pte & PT_DIRTY_MASK; } -static int is_rmap_pte(u64 pte) +static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); } @@ -502,7 +502,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) unsigned long *rmapp; int i, count = 0; - if (!is_rmap_pte(*spte)) + if (!is_rmap_spte(*spte)) return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); @@ -567,7 +567,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) unsigned long *rmapp; int i; - if (!is_rmap_pte(*spte)) + if (!is_rmap_spte(*spte)) return; sp = page_header(__pa(spte)); pfn = spte_to_pfn(*spte); @@ -1769,7 +1769,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, __func__, *shadow_pte, pt_access, write_fault, user_fault, gfn); - if (is_rmap_pte(*shadow_pte)) { + if (is_rmap_spte(*shadow_pte)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. @@ -1805,7 +1805,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, page_header_update_slot(vcpu->kvm, shadow_pte, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); - if (!is_rmap_pte(*shadow_pte)) + if (!is_rmap_spte(*shadow_pte)) kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, gfn, largepage); @@ -1984,7 +1984,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = kvm_pdptr_read(vcpu, i); - if (!is_present_pte(pdptr)) { + if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } @@ -2475,7 +2475,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if ((bytes == 4) && (gpa % 4 == 0)) memcpy((void *)&gpte, new, 4); } - if (!is_present_pte(gpte)) + if (!is_present_gpte(gpte)) return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3494a2fb136e..016bf7183e9f 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -75,7 +75,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return vcpu->arch.cr0 & X86_CR0_PG; } -static inline int is_present_pte(unsigned long pte) +static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4cb1dbfd7c2a..238a193bbf5b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -132,7 +132,7 @@ walk: #if PTTYPE == 64 if (!is_long_mode(vcpu)) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); - if (!is_present_pte(pte)) + if (!is_present_gpte(pte)) goto not_present; --walker->level; } @@ -155,7 +155,7 @@ walk: kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); - if (!is_present_pte(pte)) + if (!is_present_gpte(pte)) goto not_present; rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); @@ -205,7 +205,7 @@ walk: --walker->level; } - if (write_fault && !is_dirty_pte(pte)) { + if (write_fault && !is_dirty_gpte(pte)) { bool ret; mark_page_dirty(vcpu->kvm, table_gfn); @@ -252,7 +252,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!is_present_pte(gpte)) + if (!is_present_gpte(gpte)) set_shadow_pte(spte, shadow_notrap_nonpresent_pte); return; } @@ -289,7 +289,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, pt_element_t curr_pte; struct kvm_shadow_walk_iterator iterator; - if (!is_present_pte(gw->ptes[gw->level - 1])) + if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; for_each_shadow_entry(vcpu, addr, iterator) { @@ -318,7 +318,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { direct = 1; - if (!is_dirty_pte(gw->ptes[level - 1])) + if (!is_dirty_gpte(gw->ptes[level - 1])) access &= ~ACC_WRITE_MASK; table_gfn = gpte_to_gfn(gw->ptes[level - 1]); } else { @@ -489,7 +489,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, sizeof(pt_element_t))) return; - if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { + if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { if (mmu_topup_memory_caches(vcpu)) return; kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, @@ -536,7 +536,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); for (j = 0; j < ARRAY_SIZE(pt); ++j) - if (r || is_present_pte(pt[j])) + if (r || is_present_gpte(pt[j])) sp->spt[i+j] = shadow_trap_nonpresent_pte; else sp->spt[i+j] = shadow_notrap_nonpresent_pte; @@ -574,12 +574,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || + if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; rmap_remove(vcpu->kvm, &sp->spt[i]); - if (is_present_pte(gpte)) + if (is_present_gpte(gpte)) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; @@ -590,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_pte(gpte), 0, gfn, + is_dirty_gpte(gpte), 0, gfn, spte_to_pfn(sp->spt[i]), true, false); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05cbe83c74e2..e877efa37620 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -237,7 +237,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if (is_present_pte(pdpte[i]) && + if (is_present_gpte(pdpte[i]) && (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { ret = 0; goto out; -- cgit v1.2.3-70-g09d2 From d555c333aa544b222fe077adcd5dfea024b2c913 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 10 Jun 2009 14:24:23 +0300 Subject: KVM: MMU: s/shadow_pte/spte/ We use shadow_pte and spte inconsistently, switch to the shorter spelling. Rename set_shadow_pte() to __set_spte() to avoid a conflict with the existing set_spte(), and to indicate its lowlevelness. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 102 ++++++++++++++++++++++----------------------- arch/x86/kvm/paging_tmpl.h | 16 +++---- 2 files changed, 59 insertions(+), 59 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a039e6bc21f7..d443a421ca3e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -143,7 +143,7 @@ module_param(oos_shadow, bool, 0644); #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { - u64 *shadow_ptes[RMAP_EXT]; + u64 *sptes[RMAP_EXT]; struct kvm_rmap_desc *more; }; @@ -262,7 +262,7 @@ static gfn_t pse36_gfn_delta(u32 gpte) return (gpte & PT32_DIR_PSE36_MASK) << shift; } -static void set_shadow_pte(u64 *sptep, u64 spte) +static void __set_spte(u64 *sptep, u64 spte) { #ifdef CONFIG_X86_64 set_64bit((unsigned long *)sptep, spte); @@ -514,23 +514,23 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) } else if (!(*rmapp & 1)) { rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)*rmapp; - desc->shadow_ptes[1] = spte; + desc->sptes[0] = (u64 *)*rmapp; + desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { + while (desc->sptes[RMAP_EXT-1] && desc->more) { desc = desc->more; count += RMAP_EXT; } - if (desc->shadow_ptes[RMAP_EXT-1]) { + if (desc->sptes[RMAP_EXT-1]) { desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; } - for (i = 0; desc->shadow_ptes[i]; ++i) + for (i = 0; desc->sptes[i]; ++i) ; - desc->shadow_ptes[i] = spte; + desc->sptes[i] = spte; } return count; } @@ -542,14 +542,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp, { int j; - for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) ; - desc->shadow_ptes[i] = desc->shadow_ptes[j]; - desc->shadow_ptes[j] = NULL; + desc->sptes[i] = desc->sptes[j]; + desc->sptes[j] = NULL; if (j != 0) return; if (!prev_desc && !desc->more) - *rmapp = (unsigned long)desc->shadow_ptes[0]; + *rmapp = (unsigned long)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; @@ -594,8 +594,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) - if (desc->shadow_ptes[i] == spte) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) + if (desc->sptes[i] == spte) { rmap_desc_remove_entry(rmapp, desc, i, prev_desc); @@ -626,10 +626,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) prev_desc = NULL; prev_spte = NULL; while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { if (prev_spte == spte) - return desc->shadow_ptes[i]; - prev_spte = desc->shadow_ptes[i]; + return desc->sptes[i]; + prev_spte = desc->sptes[i]; } desc = desc->more; } @@ -651,7 +651,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writeble_pte(*spte)) { - set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -675,7 +675,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) if (is_writeble_pte(*spte)) { rmap_remove(kvm, spte); --kvm->stat.lpages; - set_shadow_pte(spte, shadow_trap_nonpresent_pte); + __set_spte(spte, shadow_trap_nonpresent_pte); spte = NULL; write_protected = 1; } @@ -694,7 +694,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); rmap_remove(kvm, spte); - set_shadow_pte(spte, shadow_trap_nonpresent_pte); + __set_spte(spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return need_tlb_flush; @@ -1369,7 +1369,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } BUG_ON(!parent_pte); kvm_mmu_put_page(sp, parent_pte); - set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); + __set_spte(parent_pte, shadow_trap_nonpresent_pte); } } @@ -1517,7 +1517,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp) for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { if (pt[i] == shadow_notrap_nonpresent_pte) - set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); + __set_spte(&pt[i], shadow_trap_nonpresent_pte); } } @@ -1683,7 +1683,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 0; } -static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, +static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, gfn_t gfn, pfn_t pfn, bool speculative, @@ -1733,7 +1733,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*shadow_pte)) + if (!can_unsync && is_writeble_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1750,62 +1750,62 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, mark_page_dirty(vcpu->kvm, gfn); set_pte: - set_shadow_pte(shadow_pte, spte); + __set_spte(sptep, spte); return ret; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, int *ptwrite, int largepage, gfn_t gfn, pfn_t pfn, bool speculative) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*shadow_pte); + int was_writeble = is_writeble_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", - __func__, *shadow_pte, pt_access, + __func__, *sptep, pt_access, write_fault, user_fault, gfn); - if (is_rmap_spte(*shadow_pte)) { + if (is_rmap_spte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. */ - if (largepage && !is_large_pte(*shadow_pte)) { + if (largepage && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; - u64 pte = *shadow_pte; + u64 pte = *sptep; child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, shadow_pte); - } else if (pfn != spte_to_pfn(*shadow_pte)) { + mmu_page_remove_parent_pte(child, sptep); + } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", - spte_to_pfn(*shadow_pte), pfn); - rmap_remove(vcpu->kvm, shadow_pte); + spte_to_pfn(*sptep), pfn); + rmap_remove(vcpu->kvm, sptep); } else was_rmapped = 1; } - if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, + if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, dirty, largepage, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); } - pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); + pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", - is_large_pte(*shadow_pte)? "2MB" : "4kB", - is_present_pte(*shadow_pte)?"RW":"R", gfn, - *shadow_pte, shadow_pte); - if (!was_rmapped && is_large_pte(*shadow_pte)) + is_large_pte(*sptep)? "2MB" : "4kB", + is_present_pte(*sptep)?"RW":"R", gfn, + *shadow_pte, sptep); + if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; - page_header_update_slot(vcpu->kvm, shadow_pte, gfn); + page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { - rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); - if (!is_rmap_spte(*shadow_pte)) + rmap_count = rmap_add(vcpu, sptep, gfn, largepage); + if (!is_rmap_spte(*sptep)) kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, gfn, largepage); @@ -1816,7 +1816,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, kvm_release_pfn_clean(pfn); } if (speculative) { - vcpu->arch.last_pte_updated = shadow_pte; + vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; } } @@ -1854,10 +1854,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return -ENOMEM; } - set_shadow_pte(iterator.sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); + __set_spte(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); } } return pt_write; @@ -2389,7 +2389,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, mmu_page_remove_parent_pte(child, spte); } } - set_shadow_pte(spte, shadow_trap_nonpresent_pte); + __set_spte(spte, shadow_trap_nonpresent_pte); if (is_large_pte(pte)) --vcpu->kvm->stat.lpages; } @@ -3125,7 +3125,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); while (d) { for (k = 0; k < RMAP_EXT; ++k) - if (d->shadow_ptes[k]) + if (d->sptes[k]) ++nmaps; else break; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 238a193bbf5b..322e8113aeea 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -253,7 +253,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { if (!is_present_gpte(gpte)) - set_shadow_pte(spte, shadow_notrap_nonpresent_pte); + __set_spte(spte, shadow_notrap_nonpresent_pte); return; } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); @@ -311,7 +311,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (is_large_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + __set_spte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } @@ -369,7 +369,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int user_fault = error_code & PFERR_USER_MASK; int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; - u64 *shadow_pte; + u64 *sptep; int write_pt = 0; int r; pfn_t pfn; @@ -422,11 +422,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - largepage, &write_pt, pfn); + sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + largepage, &write_pt, pfn); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, - shadow_pte, *shadow_pte, write_pt); + sptep, *sptep, write_pt); if (!write_pt) vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ @@ -472,7 +472,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) --vcpu->kvm->stat.lpages; need_flush = 1; } - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + __set_spte(sptep, shadow_trap_nonpresent_pte); break; } @@ -583,7 +583,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; - set_shadow_pte(&sp->spt[i], nonpresent); + __set_spte(&sp->spt[i], nonpresent); continue; } -- cgit v1.2.3-70-g09d2 From c5af89b68abb26eea5e745f33228f4d672f115e5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 9 Jun 2009 15:56:26 +0300 Subject: KVM: Introduce kvm_vcpu_is_bsp() function. Use it instead of open code "vcpu_id zero is BSP" assumption. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 2 +- arch/ia64/kvm/vcpu.c | 2 +- arch/x86/kvm/i8254.c | 4 ++-- arch/x86/kvm/i8259.c | 6 +++--- arch/x86/kvm/lapic.c | 7 ++++--- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 6 +++--- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 5 +++++ virt/kvm/ioapic.c | 4 +++- virt/kvm/kvm_main.c | 2 ++ 11 files changed, 28 insertions(+), 18 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 8dde36953af3..4082665ace0a 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1216,7 +1216,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (IS_ERR(vmm_vcpu)) return PTR_ERR(vmm_vcpu); - if (vcpu->vcpu_id == 0) { + if (kvm_vcpu_is_bsp(vcpu)) { vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; /*Set entry address for first run.*/ diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index cc406d064a09..61a3320b62c1 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -830,7 +830,7 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val) kvm = (struct kvm *)KVM_VM_BASE; - if (vcpu->vcpu_id == 0) { + if (kvm_vcpu_is_bsp(vcpu)) { for (i = 0; i < kvm->arch.online_vcpus; i++) { v = (struct kvm_vcpu *)((char *)vcpu + sizeof(struct kvm_vcpu_data) * i); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 3837db65d33e..008a83185067 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; - if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) + if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) return atomic_read(&pit->pit_state.pit_timer.pending); return 0; } @@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; - if (vcpu->vcpu_id != 0 || !pit) + if (!kvm_vcpu_is_bsp(vcpu) || !pit) return; timer = &pit->pit_state.pit_timer.timer; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index bf94a45f4f86..148c52a608d6 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -57,7 +57,7 @@ static void pic_unlock(struct kvm_pic *s) } if (wakeup) { - vcpu = s->kvm->vcpus[0]; + vcpu = s->kvm->bsp_vcpu; if (vcpu) kvm_vcpu_kick(vcpu); } @@ -254,7 +254,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, irqbase, n; struct kvm *kvm = s->pics_state->irq_request_opaque; - struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; + struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; if (s == &s->pics_state->pics[0]) irqbase = 0; @@ -512,7 +512,7 @@ static void picdev_read(struct kvm_io_device *this, static void pic_irq_request(void *opaque, int level) { struct kvm *kvm = opaque; - struct kvm_vcpu *vcpu = kvm->vcpus[0]; + struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); int irq = pic_get_irq(&s->pics[0]); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 44f20cdb5709..b0661300eb28 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -793,7 +793,8 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; return; } - if (apic->vcpu->vcpu_id) + + if (!kvm_vcpu_is_bsp(apic->vcpu)) value &= ~MSR_IA32_APICBASE_BSP; vcpu->arch.apic_base = value; @@ -844,7 +845,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) } update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); - if (vcpu->vcpu_id == 0) + if (kvm_vcpu_is_bsp(vcpu)) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; apic_update_ppr(apic); @@ -985,7 +986,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (vcpu->vcpu_id == 0) { + if (kvm_vcpu_is_bsp(vcpu)) { if (!apic_hw_enabled(vcpu->arch.apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7749b0692cb2..28b981409a8f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -642,7 +642,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcb(svm); - if (vcpu->vcpu_id != 0) { + if (!kvm_vcpu_is_bsp(vcpu)) { kvm_rip_write(vcpu, 0); svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; @@ -706,7 +706,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (svm->vcpu.vcpu_id == 0) + if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ae682929a642..c08bb4cf372e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2411,7 +2411,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vmx->vcpu.vcpu_id == 0) + if (kvm_vcpu_is_bsp(&vmx->vcpu)) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); @@ -2422,7 +2422,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. */ - if (vmx->vcpu.vcpu_id == 0) { + if (kvm_vcpu_is_bsp(&vmx->vcpu)) { vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0x000f0000); } else { @@ -2451,7 +2451,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_SYSENTER_EIP, 0); vmcs_writel(GUEST_RFLAGS, 0x02); - if (vmx->vcpu.vcpu_id == 0) + if (kvm_vcpu_is_bsp(&vmx->vcpu)) kvm_rip_write(vcpu, 0xfff0); else kvm_rip_write(vcpu, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e877efa37620..d8adc1da76dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4330,7 +4330,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); /* Older userspace won't unhalt the vcpu on reset. */ - if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && + if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !(vcpu->arch.cr0 & X86_CR0_PE)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -4601,7 +4601,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm = vcpu->kvm; vcpu->arch.mmu.root_hpa = INVALID_PAGE; - if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) + if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a29ea030dd8e..a5bd429e9bd3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -131,6 +131,7 @@ struct kvm { int nmemslots; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS]; + struct kvm_vcpu *bsp_vcpu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; struct list_head vm_list; struct mutex lock; @@ -549,4 +550,8 @@ static inline void kvm_irqfd_release(struct kvm *kvm) {} #endif /* CONFIG_HAVE_KVM_EVENTFD */ +static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->bsp_vcpu == vcpu; +} #endif diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 2a5667173995..0532fa68f5d1 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -164,7 +164,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) /* Always delivery PIT interrupt to vcpu 0 */ if (irq == 0) { irqe.dest_mode = 0; /* Physical mode. */ - irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; + /* need to read apic_id from apic regiest since + * it can be rewritten */ + irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id; } #endif return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0d481b282448..0d54edecbc70 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1747,6 +1747,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) goto vcpu_destroy; } kvm->vcpus[n] = vcpu; + if (n == 0) + kvm->bsp_vcpu = vcpu; mutex_unlock(&kvm->lock); /* Now it's all set up, let userspace reach it */ -- cgit v1.2.3-70-g09d2 From 1ed0ce000a6c20c36ec649e32fc24393ef418ed8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 9 Jun 2009 15:56:27 +0300 Subject: KVM: Use pointer to vcpu instead of vcpu_id in timer code. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/kvm_timer.h | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/timer.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 008a83185067..06d8f84ae8a2 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) pt->timer.function = kvm_timer_fn; pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; - pt->vcpu_id = 0; + pt->vcpu = pt->kvm->bsp_vcpu; atomic_set(&pt->pending, 0); ps->irq_ack = 1; diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 26bd6ba74e1c..55c7524dda54 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h @@ -6,7 +6,7 @@ struct kvm_timer { bool reinject; struct kvm_timer_ops *t_ops; struct kvm *kvm; - int vcpu_id; + struct kvm_vcpu *vcpu; }; struct kvm_timer_ops { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b0661300eb28..b1694dc45729 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -950,7 +950,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->lapic_timer.timer.function = kvm_timer_fn; apic->lapic_timer.t_ops = &lapic_timer_ops; apic->lapic_timer.kvm = vcpu->kvm; - apic->lapic_timer.vcpu_id = vcpu->vcpu_id; + apic->lapic_timer.vcpu = vcpu; apic->base_address = APIC_DEFAULT_PHYS_BASE; vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 86dbac072d0c..85cc743a8203 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -33,7 +33,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) struct kvm_vcpu *vcpu; struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; + vcpu = ktimer->vcpu; if (!vcpu) return HRTIMER_NORESTART; -- cgit v1.2.3-70-g09d2 From 73880c80aa9c8dc353cd0ad26579023213cd5314 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 9 Jun 2009 15:56:28 +0300 Subject: KVM: Break dependency between vcpu index in vcpus array and vcpu_id. Archs are free to use vcpu_id as they see fit. For x86 it is used as vcpu's apic id. New ioctl is added to configure boot vcpu id that was assumed to be 0 till now. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 1 - arch/ia64/kvm/Kconfig | 1 + arch/ia64/kvm/kvm-ia64.c | 8 ++--- arch/ia64/kvm/vcpu.c | 2 +- arch/x86/kvm/Kconfig | 1 + include/linux/kvm.h | 2 ++ include/linux/kvm_host.h | 6 ++++ virt/kvm/Kconfig | 3 ++ virt/kvm/kvm_main.c | 64 ++++++++++++++++++++++++---------------- 9 files changed, 55 insertions(+), 33 deletions(-) diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 5f43697aed30..9cf1c4b1f92f 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -465,7 +465,6 @@ struct kvm_arch { unsigned long metaphysical_rr4; unsigned long vmm_init_rr; - int online_vcpus; int is_sn2; struct kvm_ioapic *vioapic; diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index f922bbba3797..cbadd8a65233 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -25,6 +25,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_IRQCHIP + select KVM_APIC_ARCHITECTURE ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 4082665ace0a..d1f7bcda2c7f 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -338,7 +338,7 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id, union ia64_lid lid; int i; - for (i = 0; i < kvm->arch.online_vcpus; i++) { + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) { if (kvm->vcpus[i]) { lid.val = VCPU_LID(kvm->vcpus[i]); if (lid.id == id && lid.eid == eid) @@ -412,7 +412,7 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) call_data.ptc_g_data = p->u.ptc_g_data; - for (i = 0; i < kvm->arch.online_vcpus; i++) { + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) { if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state == KVM_MP_STATE_UNINITIALIZED || vcpu == kvm->vcpus[i]) @@ -852,8 +852,6 @@ struct kvm *kvm_arch_create_vm(void) kvm_init_vm(kvm); - kvm->arch.online_vcpus = 0; - return kvm; } @@ -1356,8 +1354,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, goto fail; } - kvm->arch.online_vcpus++; - return vcpu; fail: return ERR_PTR(r); diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index 61a3320b62c1..dce75b70cdd5 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -831,7 +831,7 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val) kvm = (struct kvm *)KVM_VM_BASE; if (kvm_vcpu_is_bsp(vcpu)) { - for (i = 0; i < kvm->arch.online_vcpus; i++) { + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) { v = (struct kvm_vcpu *)((char *)vcpu + sizeof(struct kvm_vcpu_data) * i); VMX(v, itc_offset) = itc_offset; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8cd2a4efe238..7fbedfd34d6c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,6 +27,7 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_EVENTFD + select KVM_APIC_ARCHITECTURE ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/include/linux/kvm.h b/include/linux/kvm.h index f1dada0519eb..5037e170a70d 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -430,6 +430,7 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_PIT #define KVM_CAP_PIT2 33 #endif +#define KVM_CAP_SET_BOOT_CPU_ID 34 #ifdef KVM_CAP_IRQ_ROUTING @@ -537,6 +538,7 @@ struct kvm_irqfd { #define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) #define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) #define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) +#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) /* * ioctls for vcpu fds diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a5bd429e9bd3..d3fdf1a738c9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -131,8 +131,12 @@ struct kvm { int nmemslots; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS]; +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + u32 bsp_vcpu_id; struct kvm_vcpu *bsp_vcpu; +#endif struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + atomic_t online_vcpus; struct list_head vm_list; struct mutex lock; struct kvm_io_bus mmio_bus; @@ -550,8 +554,10 @@ static inline void kvm_irqfd_release(struct kvm *kvm) {} #endif /* CONFIG_HAVE_KVM_EVENTFD */ +#ifdef CONFIG_KVM_APIC_ARCHITECTURE static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->bsp_vcpu == vcpu; } #endif +#endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 56c6848d2df8..daece36c0a57 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -9,3 +9,6 @@ config HAVE_KVM_IRQCHIP config HAVE_KVM_EVENTFD bool select EVENTFD + +config KVM_APIC_ARCHITECTURE + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0d54edecbc70..25e1f9c97b1a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -689,11 +689,6 @@ out: } #endif -static inline int valid_vcpu(int n) -{ - return likely(n >= 0 && n < KVM_MAX_VCPUS); -} - inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { @@ -1714,24 +1709,18 @@ static struct file_operations kvm_vcpu_fops = { */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { - int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); - if (fd < 0) - kvm_put_kvm(vcpu->kvm); - return fd; + return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); } /* * Creates some virtual cpus. Good luck creating more than one. */ -static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) { int r; struct kvm_vcpu *vcpu; - if (!valid_vcpu(n)) - return -EINVAL; - - vcpu = kvm_arch_vcpu_create(kvm, n); + vcpu = kvm_arch_vcpu_create(kvm, id); if (IS_ERR(vcpu)) return PTR_ERR(vcpu); @@ -1742,25 +1731,38 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) return r; mutex_lock(&kvm->lock); - if (kvm->vcpus[n]) { - r = -EEXIST; + if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { + r = -EINVAL; goto vcpu_destroy; } - kvm->vcpus[n] = vcpu; - if (n == 0) - kvm->bsp_vcpu = vcpu; - mutex_unlock(&kvm->lock); + + for (r = 0; r < atomic_read(&kvm->online_vcpus); r++) + if (kvm->vcpus[r]->vcpu_id == id) { + r = -EEXIST; + goto vcpu_destroy; + } + + BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); - if (r < 0) - goto unlink; + if (r < 0) { + kvm_put_kvm(kvm); + goto vcpu_destroy; + } + + kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; + smp_wmb(); + atomic_inc(&kvm->online_vcpus); + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + if (kvm->bsp_vcpu_id == id) + kvm->bsp_vcpu = vcpu; +#endif + mutex_unlock(&kvm->lock); return r; -unlink: - mutex_lock(&kvm->lock); - kvm->vcpus[n] = NULL; vcpu_destroy: mutex_unlock(&kvm->lock); kvm_arch_vcpu_destroy(vcpu); @@ -2233,6 +2235,15 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); break; } +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_SET_BOOT_CPU_ID: + r = 0; + if (atomic_read(&kvm->online_vcpus) != 0) + r = -EBUSY; + else + kvm->bsp_vcpu_id = arg; + break; +#endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } @@ -2299,6 +2310,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_USER_MEMORY: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_CAP_SET_BOOT_CPU_ID: +#endif return 1; #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQ_ROUTING: -- cgit v1.2.3-70-g09d2 From 988a2cae6a3c0dea6df59808a935a9a697bfc28c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 9 Jun 2009 15:56:29 +0300 Subject: KVM: Use macro to iterate over vcpus. [christian: remove unused variables on s390] Signed-off-by: Gleb Natapov Signed-off-by: Christian Borntraeger Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 29 ++++++++++++++--------------- arch/powerpc/kvm/powerpc.c | 16 ++++++++++------ arch/s390/kvm/kvm-s390.c | 31 +++++++++++++++---------------- arch/x86/kvm/i8254.c | 7 ++----- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/x86.c | 25 ++++++++++++------------- include/linux/kvm_host.h | 11 +++++++++++ virt/kvm/irq_comm.c | 6 ++---- virt/kvm/kvm_main.c | 19 +++++++------------ 9 files changed, 76 insertions(+), 74 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index d1f7bcda2c7f..5c766bd82b05 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -337,13 +337,12 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id, { union ia64_lid lid; int i; + struct kvm_vcpu *vcpu; - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) { - if (kvm->vcpus[i]) { - lid.val = VCPU_LID(kvm->vcpus[i]); - if (lid.id == id && lid.eid == eid) - return kvm->vcpus[i]; - } + kvm_for_each_vcpu(i, vcpu, kvm) { + lid.val = VCPU_LID(vcpu); + if (lid.id == id && lid.eid == eid) + return vcpu; } return NULL; @@ -409,21 +408,21 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct kvm *kvm = vcpu->kvm; struct call_data call_data; int i; + struct kvm_vcpu *vcpui; call_data.ptc_g_data = p->u.ptc_g_data; - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) { - if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state == - KVM_MP_STATE_UNINITIALIZED || - vcpu == kvm->vcpus[i]) + kvm_for_each_vcpu(i, vcpui, kvm) { + if (vcpui->arch.mp_state == KVM_MP_STATE_UNINITIALIZED || + vcpu == vcpui) continue; - if (waitqueue_active(&kvm->vcpus[i]->wq)) - wake_up_interruptible(&kvm->vcpus[i]->wq); + if (waitqueue_active(&vcpui->wq)) + wake_up_interruptible(&vcpui->wq); - if (kvm->vcpus[i]->cpu != -1) { - call_data.vcpu = kvm->vcpus[i]; - smp_call_function_single(kvm->vcpus[i]->cpu, + if (vcpui->cpu != -1) { + call_data.vcpu = vcpui; + smp_call_function_single(vcpui->cpu, vcpu_global_purge, &call_data, 1); } else printk(KERN_WARNING"kvm: Uninit vcpu received ipi!\n"); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2cf915e51e7e..7ad30e0a1b9a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -122,13 +122,17 @@ struct kvm *kvm_arch_create_vm(void) static void kvmppc_free_vcpus(struct kvm *kvm) { unsigned int i; + struct kvm_vcpu *vcpu; - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arch_vcpu_free(vcpu); + + mutex_lock(&kvm->lock); + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) + kvm->vcpus[i] = NULL; + + atomic_set(&kvm->online_vcpus, 0); + mutex_unlock(&kvm->lock); } void kvm_arch_sync_events(struct kvm *kvm) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 098bfa6fbdf6..07ced89740d7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -211,13 +211,17 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; + struct kvm_vcpu *vcpu; - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_destroy(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arch_vcpu_destroy(vcpu); + + mutex_lock(&kvm->lock); + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) + kvm->vcpus[i] = NULL; + + atomic_set(&kvm->online_vcpus, 0); + mutex_unlock(&kvm->lock); } void kvm_arch_sync_events(struct kvm *kvm) @@ -314,8 +318,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, BUG_ON(!kvm->arch.sca); if (!kvm->arch.sca->cpu[id].sda) kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; - else - BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */ vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; @@ -683,6 +685,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, int user_alloc) { int i; + struct kvm_vcpu *vcpu; /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a @@ -707,14 +710,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm, return -EINVAL; /* request update of sie control block for all available vcpus */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - if (test_and_set_bit(KVM_REQ_MMU_RELOAD, - &kvm->vcpus[i]->requests)) - continue; - kvm_s390_inject_sigp_stop(kvm->vcpus[i], - ACTION_RELOADVCPU_ON_STOP); - } + kvm_for_each_vcpu(i, vcpu, kvm) { + if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + continue; + kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP); } return 0; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 06d8f84ae8a2..15fc95b2fc05 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -669,11 +669,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm) * VCPU0, and only if its LVT0 is in EXTINT mode. */ if (kvm->arch.vapics_in_nmi_mode > 0) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (vcpu) - kvm_apic_nmi_wd_deliver(vcpu); - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); } void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d443a421ca3e..5f97dbd24291 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1347,10 +1347,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) { int i; + struct kvm_vcpu *vcpu; - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm->vcpus[i]->arch.last_pte_updated = NULL; + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.last_pte_updated = NULL; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d8adc1da76dd..89862a80e32c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2946,10 +2946,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; + kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; if (!kvm_request_guest_time_update(vcpu)) @@ -4678,20 +4675,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; + struct kvm_vcpu *vcpu; /* * Unpin any mmu pages first. */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm_unload_vcpu_mmu(kvm->vcpus[i]); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_unload_vcpu_mmu(vcpu); + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arch_vcpu_free(vcpu); + + mutex_lock(&kvm->lock); + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) + kvm->vcpus[i] = NULL; + atomic_set(&kvm->online_vcpus, 0); + mutex_unlock(&kvm->lock); } void kvm_arch_sync_events(struct kvm *kvm) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d3fdf1a738c9..c6e4d02067fe 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -179,6 +179,17 @@ struct kvm { #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) +static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) +{ + smp_rmb(); + return kvm->vcpus[i]; +} + +#define kvm_for_each_vcpu(idx, vcpup, kvm) \ + for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \ + idx < atomic_read(&kvm->online_vcpus) && vcpup; \ + vcpup = kvm_get_vcpu(kvm, ++idx)) + int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 08a9a49481b2..bb8a1b5e41c1 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -68,10 +68,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, kvm_is_dm_lowest_prio(irq)) printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); - for (i = 0; i < KVM_MAX_VCPUS; i++) { - vcpu = kvm->vcpus[i]; - - if (!vcpu || !kvm_apic_present(vcpu)) + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) continue; if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 25e1f9c97b1a..777fe533cfe7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -738,10 +738,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) me = get_cpu(); spin_lock(&kvm->requests_lock); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; + kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(req, &vcpu->requests)) continue; cpu = vcpu->cpu; @@ -1718,7 +1715,7 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) { int r; - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu, *v; vcpu = kvm_arch_vcpu_create(kvm, id); if (IS_ERR(vcpu)) @@ -1736,8 +1733,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_destroy; } - for (r = 0; r < atomic_read(&kvm->online_vcpus); r++) - if (kvm->vcpus[r]->vcpu_id == id) { + kvm_for_each_vcpu(r, v, kvm) + if (v->vcpu_id == id) { r = -EEXIST; goto vcpu_destroy; } @@ -2526,11 +2523,9 @@ static int vcpu_stat_get(void *_offset, u64 *val) *val = 0; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (vcpu) - *val += *(u32 *)((void *)vcpu + offset); - } + kvm_for_each_vcpu(i, vcpu, kvm) + *val += *(u32 *)((void *)vcpu + offset); + spin_unlock(&kvm_lock); return 0; } -- cgit v1.2.3-70-g09d2 From 3f5d18a96577fd78277e08c467041573b9a65eaf Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 11 Jun 2009 15:43:28 +0300 Subject: KVM: Return to userspace on emulation failure Instead of mindlessly retrying to execute the instruction, report the failure to userspace. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- include/linux/kvm.h | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5f97dbd24291..b6e4cda77047 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2673,8 +2673,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) ++vcpu->stat.mmio_exits; return 0; case EMULATE_FAIL: - kvm_report_emulation_failure(vcpu, "pagetable"); - return 1; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + return 0; default: BUG(); } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 5037e170a70d..671051829da6 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -95,6 +95,10 @@ struct kvm_pit_config { #define KVM_EXIT_S390_RESET 14 #define KVM_EXIT_DCR 15 #define KVM_EXIT_NMI 16 +#define KVM_EXIT_INTERNAL_ERROR 17 + +/* For KVM_EXIT_INTERNAL_ERROR */ +#define KVM_INTERNAL_ERROR_EMULATION 1 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { @@ -181,6 +185,9 @@ struct kvm_run { __u32 data; __u8 is_write; } dcr; + struct { + __u32 suberror; + } internal; /* Fix the size of the union. */ char padding[256]; }; -- cgit v1.2.3-70-g09d2 From 776e6633363b5616be6fa4493a8b70ef8e2ea04b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Jun 2009 12:27:03 -0300 Subject: KVM: MMU: introduce is_last_spte helper Hiding some of the last largepage / level interaction (which is useful for gbpages and for zero based levels). Also merge the PT_PAGE_TABLE_LEVEL clearing loop in unlink_children. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b6e4cda77047..f85d9953a620 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -250,6 +250,15 @@ static int is_rmap_spte(u64 pte) return is_shadow_present_pte(pte); } +static int is_last_spte(u64 pte, int level) +{ + if (level == PT_PAGE_TABLE_LEVEL) + return 1; + if (level == PT_DIRECTORY_LEVEL && is_large_pte(pte)) + return 1; + return 0; +} + static pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -1313,25 +1322,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, pt = sp->spt; - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (is_shadow_present_pte(pt[i])) - rmap_remove(kvm, &pt[i]); - pt[i] = shadow_trap_nonpresent_pte; - } - return; - } - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { ent = pt[i]; if (is_shadow_present_pte(ent)) { - if (!is_large_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { ent &= PT64_BASE_ADDR_MASK; mmu_page_remove_parent_pte(page_header(ent), &pt[i]); } else { - --kvm->stat.lpages; + if (is_large_pte(ent)) + --kvm->stat.lpages; rmap_remove(kvm, &pt[i]); } } @@ -2381,8 +2382,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { - if (sp->role.level == PT_PAGE_TABLE_LEVEL || - is_large_pte(pte)) + if (is_last_spte(pte, sp->role.level)) rmap_remove(vcpu->kvm, spte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); -- cgit v1.2.3-70-g09d2 From 08a3732bf2e68048c9166e929ca2115127a412ab Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Jun 2009 12:27:04 -0300 Subject: KVM: MMU audit: update count_writable_mappings / count_rmaps Under testing, count_writable_mappings returns a value that is 2 integers larger than what count_rmaps returns. Suspicion is that either of the two functions is counting a duplicate (either positively or negatively). Modifying check_writable_mappings_rmap to check for rmap existance on all present MMU pages fails to trigger an error, which should keep Avi happy. Also introduce mmu_spte_walk to invoke a callback on all present sptes visible to the current vcpu, might be useful in the future. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 94 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f85d9953a620..fd5579cc8abd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3045,6 +3045,55 @@ static gva_t canonicalize(gva_t gva) return gva; } + +typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *sptep); + +static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, + inspect_spte_fn fn) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + if (sp->role.level > 1 && !is_large_pte(ent)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(kvm, child, fn); + } + if (sp->role.level == 1) + fn(kvm, sp, &sp->spt[i]); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + } + } + return; +} + static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, gva_t va, int level) { @@ -3137,9 +3186,47 @@ static int count_rmaps(struct kvm_vcpu *vcpu) return nmaps; } -static int count_writable_mappings(struct kvm_vcpu *vcpu) +void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + if (*sptep & PT_WRITABLE_MASK) { + rev_sp = page_header(__pa(sptep)); + gfn = rev_sp->gfns[sptep - rev_sp->spt]; + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no memslot for gfn %ld\n", + audit_msg, gfn); + printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", + audit_msg, sptep - rev_sp->spt, + rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 0); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no rmap for writable spte %llx\n", + audit_msg, *sptep); + dump_stack(); + } + } + +} + +void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, inspect_spte_has_rmap); +} + +static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) { - int nmaps = 0; struct kvm_mmu_page *sp; int i; @@ -3156,20 +3243,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) continue; if (!(ent & PT_WRITABLE_MASK)) continue; - ++nmaps; + inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); } } - return nmaps; + return; } static void audit_rmap(struct kvm_vcpu *vcpu) { - int n_rmap = count_rmaps(vcpu); - int n_actual = count_writable_mappings(vcpu); - - if (n_rmap != n_actual) - printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", - __func__, audit_msg, n_rmap, n_actual); + check_writable_mappings_rmap(vcpu); + count_rmaps(vcpu); } static void audit_write_protection(struct kvm_vcpu *vcpu) @@ -3203,6 +3286,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) audit_rmap(vcpu); audit_write_protection(vcpu); audit_mappings(vcpu); + audit_writable_sptes_have_rmaps(vcpu); dbg = olddbg; } -- cgit v1.2.3-70-g09d2 From e58b0f9e0e2c17112e375a3f0ca1ef7e57730f68 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Jun 2009 12:27:05 -0300 Subject: KVM: MMU audit: update audit_write_protection - Unsync pages contain writable sptes in the rmap. - rmaps do not exclusively contain writable sptes anymore. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fd5579cc8abd..4c2585cab189 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3260,20 +3260,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) struct kvm_mmu_page *sp; struct kvm_memory_slot *slot; unsigned long *rmapp; + u64 *spte; gfn_t gfn; list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { if (sp->role.direct) continue; + if (sp->unsync) + continue; gfn = unalias_gfn(vcpu->kvm, sp->gfn); slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; - if (*rmapp) - printk(KERN_ERR "%s: (%s) shadow page has writable" - " mappings: gfn %lx role %x\n", + + spte = rmap_next(vcpu->kvm, rmapp, NULL); + while (spte) { + if (*spte & PT_WRITABLE_MASK) + printk(KERN_ERR "%s: (%s) shadow page has " + "writable mappings: gfn %lx role %x\n", __func__, audit_msg, sp->gfn, sp->role.word); + spte = rmap_next(vcpu->kvm, rmapp, spte); + } } } -- cgit v1.2.3-70-g09d2 From 48fc03174b125238c541cf00acd5e9b9dff6b9ba Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Jun 2009 12:27:06 -0300 Subject: KVM: MMU audit: nontrapping ptes in nonleaf level It is valid to set non leaf sptes as notrap. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4c2585cab189..86433513fb71 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3109,12 +3109,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, va = canonicalize(va); if (level > 1) { - if (ent == shadow_notrap_nonpresent_pte) - printk(KERN_ERR "audit: (%s) nontrapping pte" - " in nonleaf level: levels %d gva %lx" - " level %d pte %llx\n", audit_msg, - vcpu->arch.mmu.root_level, va, level, ent); - else + if (is_shadow_present_pte(ent)) audit_mappings_page(vcpu, ent, va, level - 1); } else { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); -- cgit v1.2.3-70-g09d2 From 2aaf65e8c40468b198b709a765abe311f91c1a1d Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Jun 2009 12:27:07 -0300 Subject: KVM: MMU audit: audit_mappings tweaks - Fail early in case gfn_to_pfn returns is_error_pfn. - For the pre pte write case, avoid spurious "gva is valid but spte is notrap" messages (the emulation code does the guest write first, so this particular case is OK). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 86433513fb71..50fe8541ed39 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3117,6 +3117,11 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + continue; + } + if (is_shadow_present_pte(ent) && (ent & PT64_BASE_ADDR_MASK) != hpa) printk(KERN_ERR "xx audit error: (%s) levels %d" @@ -3288,7 +3293,8 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) audit_msg = msg; audit_rmap(vcpu); audit_write_protection(vcpu); - audit_mappings(vcpu); + if (strcmp("pre pte write", audit_msg) != 0) + audit_mappings(vcpu); audit_writable_sptes_have_rmaps(vcpu); dbg = olddbg; } -- cgit v1.2.3-70-g09d2 From 2920d7285740582d6101f32c37d8d54f82531e1e Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Jun 2009 12:27:08 -0300 Subject: KVM: MMU audit: largepage handling Make the audit code aware of largepages. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 50fe8541ed39..780ce3fe7911 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3058,12 +3058,11 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, u64 ent = sp->spt[i]; if (is_shadow_present_pte(ent)) { - if (sp->role.level > 1 && !is_large_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { struct kvm_mmu_page *child; child = page_header(ent & PT64_BASE_ADDR_MASK); __mmu_spte_walk(kvm, child, fn); - } - if (sp->role.level == 1) + } else fn(kvm, sp, &sp->spt[i]); } } @@ -3108,10 +3107,9 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, continue; va = canonicalize(va); - if (level > 1) { - if (is_shadow_present_pte(ent)) - audit_mappings_page(vcpu, ent, va, level - 1); - } else { + if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) + audit_mappings_page(vcpu, ent, va, level - 1); + else { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); @@ -3208,7 +3206,8 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 0); + rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], + is_large_pte(*sptep)); if (!*rmapp) { if (!printk_ratelimit()) return; -- cgit v1.2.3-70-g09d2 From 71db602322b1197e7951655c46339324b6208bf9 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 12 Jun 2009 22:01:29 +0200 Subject: KVM: Move performance counter MSR access interception to generic x86 path The performance counter MSRs are different for AMD and Intel CPUs and they are chosen mainly by the CPUID vendor string. This patch catches writes to all addresses (regardless of VMX/SVM path) and handles them in the generic MSR handler routine. Writing a 0 into the event select register is something we perfectly emulate ;-), so don't print out a warning to dmesg in this case. This fixes booting a 64bit Windows guest with an AMD CPUID on an Intel host. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 16 ---------------- arch/x86/kvm/vmx.c | 12 ------------ arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 28b981409a8f..060aa9f5571e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2142,22 +2142,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) else svm_disable_lbrv(svm); break; - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: - /* - * Just discard all writes to the performance counters; this - * should keep both older linux and windows 64-bit guests - * happy - */ - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); - - break; case MSR_VM_HSAVE_PA: svm->hsave_msr = data; break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c08bb4cf372e..6ee929255a3d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1024,18 +1024,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_TSC: rdtscll(host_tsc); guest_write_tsc(data, host_tsc); - break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - /* - * Just discard all writes to the performance counters; this - * should keep both older linux and windows 64-bit guests - * happy - */ - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); - break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 89862a80e32c..30492f0ba4ea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -886,6 +886,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return set_msr_mce(vcpu, msr, data); + + /* Performance counters are not protected by a CPUID bit, + * so we should check all of them in the generic path for the sake of + * cross vendor migration. + * Writing a zero into the event select MSRs disables them, + * which we perfectly emulate ;-). Any other value should be at least + * reported, some guests depend on them. + */ + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + if (data != 0) + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + /* at least RHEL 4 unconditionally writes to the perfctr registers, + * so we ignore writes to make it happy. + */ + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; default: pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); return 1; -- cgit v1.2.3-70-g09d2 From e799794e02a368f79c3fae26aabaaadd0b7466ce Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 11 Jun 2009 12:07:40 -0300 Subject: KVM: VMX: more MSR_IA32_VMX_EPT_VPID_CAP capability bits Required for EPT misconfiguration handler. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 7 +++++++ arch/x86/kvm/vmx.c | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index e7927a639d69..272514c2d456 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -352,9 +352,16 @@ enum vmcs_field { #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 + +#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) +#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) +#define VMX_EPTP_UC_BIT (1ull << 8) +#define VMX_EPTP_WB_BIT (1ull << 14) +#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) + #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6ee929255a3d..6610181267b1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -270,6 +270,26 @@ static inline bool cpu_has_vmx_flexpriority(void) cpu_has_vmx_virtualize_apic_accesses(); } +static inline bool cpu_has_vmx_ept_execute_only(void) +{ + return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); +} + +static inline bool cpu_has_vmx_eptp_uncacheable(void) +{ + return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); +} + +static inline bool cpu_has_vmx_eptp_writeback(void) +{ + return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); +} + +static inline bool cpu_has_vmx_ept_2m_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); -- cgit v1.2.3-70-g09d2 From 4d88954d6246d7d43bb8903981731002179f1a1c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 11 Jun 2009 12:07:41 -0300 Subject: KVM: MMU: make for_each_shadow_entry aware of largepages This way there is no need to add explicit checks in every for_each_shadow_entry user. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 780ce3fe7911..e18f65bf2de7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1302,6 +1302,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PT_PAGE_TABLE_LEVEL) return false; + + if (iterator->level == PT_PAGE_TABLE_LEVEL) + if (is_large_pte(*iterator->sptep)) + return false; + iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; -- cgit v1.2.3-70-g09d2 From 94d8b056a20bac4f9905d6dafcf7b7005207684a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 11 Jun 2009 12:07:42 -0300 Subject: KVM: MMU: add kvm_mmu_get_spte_hierarchy helper Required by EPT misconfiguration handler. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 ++++++++++++++++++ arch/x86/kvm/mmu.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e18f65bf2de7..12974de88aa5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3038,6 +3038,24 @@ out: return r; } +int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) +{ + struct kvm_shadow_walk_iterator iterator; + int nr_sptes = 0; + + spin_lock(&vcpu->kvm->mmu_lock); + for_each_shadow_entry(vcpu, addr, iterator) { + sptes[iterator.level-1] = *iterator.sptep; + nr_sptes++; + if (!is_shadow_present_pte(*iterator.sptep)) + break; + } + spin_unlock(&vcpu->kvm->mmu_lock); + + return nr_sptes; +} +EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); + #ifdef AUDIT static const char *audit_msg; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 016bf7183e9f..61a1b3884b49 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -37,6 +37,8 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) -- cgit v1.2.3-70-g09d2 From 68f89400bc92421d6da22e1ec8e3ec599c3c8244 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 11 Jun 2009 12:07:43 -0300 Subject: KVM: VMX: EPT misconfiguration handler Handler for EPT misconfiguration which checks for valid state in the shadow pagetables, printing the spte on each level. The separate WARN_ONs are useful for kerneloops.org. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6610181267b1..94c07ada103e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3227,6 +3227,89 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } +static u64 ept_rsvd_mask(u64 spte, int level) +{ + int i; + u64 mask = 0; + + for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) + mask |= (1ULL << i); + + if (level > 2) + /* bits 7:3 reserved */ + mask |= 0xf8; + else if (level == 2) { + if (spte & (1ULL << 7)) + /* 2MB ref, bits 20:12 reserved */ + mask |= 0x1ff000; + else + /* bits 6:3 reserved */ + mask |= 0x78; + } + + return mask; +} + +static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, + int level) +{ + printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); + + /* 010b (write-only) */ + WARN_ON((spte & 0x7) == 0x2); + + /* 110b (write/execute) */ + WARN_ON((spte & 0x7) == 0x6); + + /* 100b (execute-only) and value not supported by logical processor */ + if (!cpu_has_vmx_ept_execute_only()) + WARN_ON((spte & 0x7) == 0x4); + + /* not 000b */ + if ((spte & 0x7)) { + u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); + + if (rsvd_bits != 0) { + printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", + __func__, rsvd_bits); + WARN_ON(1); + } + + if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { + u64 ept_mem_type = (spte & 0x38) >> 3; + + if (ept_mem_type == 2 || ept_mem_type == 3 || + ept_mem_type == 7) { + printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", + __func__, ept_mem_type); + WARN_ON(1); + } + } + } +} + +static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 sptes[4]; + int nr_sptes, i; + gpa_t gpa; + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + + printk(KERN_ERR "EPT: Misconfiguration.\n"); + printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); + + nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); + + for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) + ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); + + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + + return 0; +} + static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u32 cpu_based_vm_exec_control; @@ -3306,8 +3389,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, - [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, }; static const int kvm_vmx_max_exit_handlers = -- cgit v1.2.3-70-g09d2 From 54dee9933e8d93589ad63ec3d6be39f1921b0767 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 11 Jun 2009 12:07:44 -0300 Subject: KVM: VMX: conditionally disable 2M pages Disable usage of 2M pages if VMX_EPT_2MB_PAGE_BIT (bit 16) is clear in MSR_IA32_VMX_EPT_VPID_CAP and EPT is enabled. [avi: s/largepages_disabled/largepages_enabled/ to avoid negative logic] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 +++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 14 ++++++++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 94c07ada103e..fc8d49c6bc51 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1381,6 +1381,9 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + return alloc_kvm_area(); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c6e4d02067fe..6988858dc56e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -224,6 +224,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, int user_alloc); +void kvm_disable_largepages(void); void kvm_arch_flush_shadow(struct kvm *kvm); gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 777fe533cfe7..48d5e697bf44 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -85,6 +85,8 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, static bool kvm_rebooting; +static bool largepages_enabled = true; + #ifdef KVM_CAP_DEVICE_ASSIGNMENT static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) @@ -1174,9 +1176,11 @@ int __kvm_set_memory_region(struct kvm *kvm, ugfn = new.userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each - * other, disable large page support for this slot + * other, or if explicitly asked to, disable large page + * support for this slot */ - if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1)) + if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1) || + !largepages_enabled) for (i = 0; i < largepages; ++i) new.lpage_info[i].write_count = 1; } @@ -1291,6 +1295,12 @@ out: return r; } +void kvm_disable_largepages(void) +{ + largepages_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_largepages); + int is_error_page(struct page *page) { return page == bad_page; -- cgit v1.2.3-70-g09d2 From 6edf14d8d0df144d6928799040f46fa37b5460ae Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Jun 2009 11:26:11 +0300 Subject: KVM: Replace pending exception by PF if it happens serially Replace previous exception with a new one in a hope that instruction re-execution will regenerate lost exception. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 30492f0ba4ea..a066876f1373 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -181,16 +181,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, ++vcpu->stat.pf_guest; if (vcpu->arch.exception.pending) { - if (vcpu->arch.exception.nr == PF_VECTOR) { - printk(KERN_DEBUG "kvm: inject_page_fault:" - " double fault 0x%lx\n", addr); - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - } else if (vcpu->arch.exception.nr == DF_VECTOR) { + switch(vcpu->arch.exception.nr) { + case DF_VECTOR: /* triple fault -> shutdown */ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + case PF_VECTOR: + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + return; + default: + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + vcpu->arch.exception.pending = false; + break; } - return; } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); -- cgit v1.2.3-70-g09d2 From 33e4c68656a2e461b296ce714ec322978de85412 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Jun 2009 11:06:51 +0300 Subject: KVM: Optimize searching for highest IRR Most of the time IRR is empty, so instead of scanning the whole IRR on each VM entry keep a variable that tells us if IRR is not empty. IRR will have to be scanned twice on each IRQ delivery, but this is much more rare than VM entry. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 24 +++++++++++++++++++++--- arch/x86/kvm/lapic.h | 1 + 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b1694dc45729..3bde43c3789e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -165,29 +165,46 @@ static int find_highest_vector(void *bitmap) static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { + apic->irr_pending = true; return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); } -static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) +static inline int apic_search_irr(struct kvm_lapic *apic) { - apic_clear_vector(vec, apic->regs + APIC_IRR); + return find_highest_vector(apic->regs + APIC_IRR); } static inline int apic_find_highest_irr(struct kvm_lapic *apic) { int result; - result = find_highest_vector(apic->regs + APIC_IRR); + if (!apic->irr_pending) + return -1; + + result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); return result; } +static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) +{ + apic->irr_pending = false; + apic_clear_vector(vec, apic->regs + APIC_IRR); + if (apic_search_irr(apic) != -1) + apic->irr_pending = true; +} + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; + /* This may race with setting of irr in __apic_accept_irq() and + * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq + * will cause vmexit immediately and the value will be recalculated + * on the next vmentry. + */ if (!apic) return 0; highest_irr = apic_find_highest_irr(apic); @@ -843,6 +860,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } + apic->irr_pending = false; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a587f8349c46..3f3ecc6edbf5 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -12,6 +12,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool irr_pending; struct page *regs_page; void *regs; gpa_t vapic_addr; -- cgit v1.2.3-70-g09d2 From f7104db26ab2bc5f642892774ac8fb0f15400969 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 9 Jun 2009 15:37:01 +0200 Subject: KVM: Fix racy event propagation in timer Minor issue that likely had no practical relevance: the kvm timer function so far incremented the pending counter and then may reset it again to 1 in case reinjection was disabled. This opened a small racy window with the corresponding VCPU loop that may have happened to run on another (real) CPU and already consumed the value. Fix it by skipping the incrementation in case pending is already > 0. This opens a different race windows, but may only rarely cause lost events in case we do not care about them anyway (!reinject). Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/timer.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 85cc743a8203..1baed414b57a 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) int restart_timer = 0; wait_queue_head_t *q = &vcpu->wq; - /* FIXME: this code should not know anything about vcpus */ - if (!atomic_inc_and_test(&ktimer->pending)) - set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - - if (!ktimer->reinject) - atomic_set(&ktimer->pending, 1); + /* + * There is a race window between reading and incrementing, but we do + * not care about potentially loosing timer events in the !reinject + * case anyway. + */ + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + /* FIXME: this code should not know anything about vcpus */ + if (!atomic_inc_and_test(&ktimer->pending)) + set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + } if (waitqueue_active(q)) wake_up_interruptible(q); -- cgit v1.2.3-70-g09d2 From 681405bfc73a2717ae9b03b2bad465b009106f31 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 9 Jun 2009 15:37:03 +0200 Subject: KVM: Drop useless atomic test from timer function The current code tries to optimize the setting of KVM_REQ_PENDING_TIMER but used atomic_inc_and_test - which always returns true unless pending had the invalid value of -1 on entry. This patch drops the test part preserving the original semantic but expressing it less confusingly. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 1baed414b57a..eea40439066c 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -15,9 +15,9 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) * case anyway. */ if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); /* FIXME: this code should not know anything about vcpus */ - if (!atomic_inc_and_test(&ktimer->pending)) - set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); } if (waitqueue_active(q)) -- cgit v1.2.3-70-g09d2 From d3edefc0035669e8817d1d5c32ef03e882477323 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 16 Jun 2009 12:33:56 +0300 Subject: KVM: VMX: Only reload guest cr2 if different from host cr2 cr2 changes only rarely, and writing it is expensive. Avoid the costly cr2 writes by checking if it does not already hold the desired value. Shaves 70 cycles off the vmexit latency. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fc8d49c6bc51..1a84ca191cd1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3651,11 +3651,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%"R"sp, %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" + /* Reload cr2 if changed */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%cr2, %%"R"dx \n\t" + "cmp %%"R"ax, %%"R"dx \n\t" + "je 2f \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "2: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[cr2](%0), %%"R"ax \n\t" - "mov %%"R"ax, %%cr2 \n\t" "mov %c[rax](%0), %%"R"ax \n\t" "mov %c[rbx](%0), %%"R"bx \n\t" "mov %c[rdx](%0), %%"R"dx \n\t" -- cgit v1.2.3-70-g09d2 From b3dbf89e676e47ef3b10802f9aba5a8e04aba132 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 16 Jun 2009 12:36:33 +0300 Subject: KVM: SVM: Don't save/restore host cr2 The host never reads cr2 in process context, so are free to clobber it. The vmx code does this, so we can safely remove the save/restore code. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 060aa9f5571e..132be0c7c3eb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -81,7 +81,6 @@ struct vcpu_svm { u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; u64 host_gs_base; - unsigned long host_cr2; u32 *msrpm; struct vmcb *hsave; @@ -186,19 +185,6 @@ static inline void invlpga(unsigned long addr, u32 asid) asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); } -static inline unsigned long kvm_read_cr2(void) -{ - unsigned long cr2; - - asm volatile ("mov %%cr2, %0" : "=r" (cr2)); - return cr2; -} - -static inline void kvm_write_cr2(unsigned long val) -{ - asm volatile ("mov %0, %%cr2" :: "r" (val)); -} - static inline void force_new_asid(struct kvm_vcpu *vcpu) { to_svm(vcpu)->asid_generation--; @@ -2527,7 +2513,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) fs_selector = kvm_read_fs(); gs_selector = kvm_read_gs(); ldt_selector = kvm_read_ldt(); - svm->host_cr2 = kvm_read_cr2(); if (!is_nested(svm)) svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ @@ -2614,8 +2599,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_write_cr2(svm->host_cr2); - kvm_load_fs(fs_selector); kvm_load_gs(gs_selector); kvm_load_ldt(ldt_selector); -- cgit v1.2.3-70-g09d2 From 0367b4330e463c45981437083991b90d25a9d78d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Jun 2009 15:21:22 +0200 Subject: x86: Add definition for IGNNE MSR Hyper-V accesses MSR_IGNNE while running under KVM. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6be7fc254b59..bd5549034a95 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -374,6 +374,7 @@ /* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 +#define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 #endif /* _ASM_X86_MSR_INDEX_H */ -- cgit v1.2.3-70-g09d2 From 3c5d0a44b011e0a1d857452f05c698e1008b4b4a Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Jun 2009 15:21:23 +0200 Subject: KVM: Implement MSRs used by Hyper-V Hyper-V uses some MSRs, some of which are actually reserved for BIOS usage. But let's be nice today and have it its way, because otherwise it fails terribly. [jaswinder: fix build for linux-next changes] Signed-off-by: Alexander Graf Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 132be0c7c3eb..146c17d406ed 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2131,6 +2131,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_VM_HSAVE_PA: svm->hsave_msr = data; break; + case MSR_VM_CR: + case MSR_VM_IGNNE: + case MSR_K7_HWCR: + pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + break; default: return kvm_set_msr_common(vcpu, ecx, data); } -- cgit v1.2.3-70-g09d2 From ff092385e8285c03d8b148f42f46f98c5f4becd5 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Jun 2009 15:21:24 +0200 Subject: KVM: SVM: Implement INVLPGA SVM adds another way to do INVLPG by ASID which Hyper-V makes use of, so let's implement it! For now we just do the same thing invlpg does, as asid switching means we flush the mmu anyways. That might change one day though. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 146c17d406ed..be699795d70f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1880,6 +1880,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + nsvm_printk("INVLPGA\n"); + + /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ + kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + return 1; +} + static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -2227,7 +2240,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, - [SVM_EXIT_INVLPGA] = invalid_op_interception, + [SVM_EXIT_INVLPGA] = invlpga_interception, [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, -- cgit v1.2.3-70-g09d2 From 219b65dcf6c0bad83d51bfa12e25891c02de2414 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Jun 2009 15:21:25 +0200 Subject: KVM: SVM: Improve nested interrupt injection While trying to get Hyper-V running, I realized that the interrupt injection mechanisms that are in place right now are not 100% correct. This patch makes nested SVM's interrupt injection behave more like on a real machine. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index be699795d70f..456666183770 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1612,7 +1612,8 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, /* Kill any pending exceptions */ if (svm->vcpu.arch.exception.pending == true) nsvm_printk("WARNING: Pending Exception\n"); - svm->vcpu.arch.exception.pending = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); /* Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; @@ -1680,7 +1681,8 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->nested_vmcb = svm->vmcb->save.rax; /* Clear internal status */ - svm->vcpu.arch.exception.pending = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); /* Save the old vmcb, so we don't need to pick what we save, but can restore everything when a VMEXIT occurs */ @@ -2362,21 +2364,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); } -static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.event_inj = nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; -} - static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - nested_svm_intr(svm); + BUG_ON(!(svm->vcpu.arch.hflags & HF_GIF_MASK)); - svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); + svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | + SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) @@ -2404,13 +2399,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; return (vmcb->save.rflags & X86_EFLAGS_IF) && !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vcpu.arch.hflags & HF_GIF_MASK); + (svm->vcpu.arch.hflags & HF_GIF_MASK) && + !is_nested(svm); } static void enable_irq_window(struct kvm_vcpu *vcpu) { - svm_set_vintr(to_svm(vcpu)); - svm_inject_irq(to_svm(vcpu), 0x0); + struct vcpu_svm *svm = to_svm(vcpu); + nsvm_printk("Trying to open IRQ window\n"); + + nested_svm_intr(svm); + + /* In case GIF=0 we can't rely on the CPU to tell us when + * GIF becomes 1, because that's a separate STGI/VMRUN intercept. + * The next time we get that intercept, this function will be + * called again though and we'll get the vintr intercept. */ + if (svm->vcpu.arch.hflags & HF_GIF_MASK) { + svm_set_vintr(svm); + svm_inject_irq(svm, 0x0); + } } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -2489,6 +2496,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) case SVM_EXITINTINFO_TYPE_EXEPT: /* In case of software exception do not reinject an exception vector, but re-execute and instruction instead */ + if (is_nested(svm)) + break; if (kvm_exception_is_soft(vector)) break; if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { -- cgit v1.2.3-70-g09d2 From 229456fc34b1c9031b04f7581e7b755d1cebfe9c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 17 Jun 2009 09:22:14 -0300 Subject: KVM: convert custom marker based tracing to event traces This allows use of the powerful ftrace infrastructure. See Documentation/trace/ for usage information. [avi, stephen: various build fixes] [sheng: fix control register breakage] Signed-off-by: Marcelo Tosatti Signed-off-by: Stephen Rothwell Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/Makefile | 4 + arch/x86/kvm/lapic.c | 7 +- arch/x86/kvm/svm.c | 84 +++++++++---- arch/x86/kvm/trace.h | 260 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx.c | 78 +++++++----- arch/x86/kvm/x86.c | 48 +++----- include/trace/events/kvm.h | 57 +++++++++ virt/kvm/irq_comm.c | 5 + virt/kvm/kvm_main.c | 4 + 10 files changed, 463 insertions(+), 86 deletions(-) create mode 100644 arch/x86/kvm/trace.h create mode 100644 include/trace/events/kvm.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c7b0cc2b7020..19027ab20412 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -527,6 +528,7 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + const struct trace_print_flags *exit_reasons_str; }; extern struct kvm_x86_ops *kvm_x86_ops; diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 01e3c61f749a..7c56850b82cb 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,6 +1,10 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm +CFLAGS_x86.o := -I. +CFLAGS_svm.o := -I. +CFLAGS_vmx.o := -I. + kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o) kvm-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3bde43c3789e..2e0286596387 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -34,6 +34,7 @@ #include #include "kvm_cache_regs.h" #include "irq.h" +#include "trace.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -515,8 +516,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; - KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); - if (offset >= LAPIC_MMIO_LENGTH) return 0; @@ -562,6 +561,8 @@ static void apic_mmio_read(struct kvm_io_device *this, } result = __apic_read(apic, offset & ~0xf); + trace_kvm_apic_read(offset, result); + switch (len) { case 1: case 2: @@ -657,7 +658,7 @@ static void apic_mmio_write(struct kvm_io_device *this, offset &= 0xff0; - KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); + trace_kvm_apic_write(offset, val); switch (offset) { case APIC_ID: /* Local APIC ID */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 456666183770..b1c446208867 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -25,10 +25,12 @@ #include #include #include +#include #include #include +#include "trace.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -1096,7 +1098,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) val = 0; } - KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); return val; } @@ -1105,8 +1106,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, { struct vcpu_svm *svm = to_svm(vcpu); - KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler); - *exception = 0; switch (dr) { @@ -1154,14 +1153,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; - if (!npt_enabled) - KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, - (u32)fault_address, (u32)(fault_address >> 32), - handler); - else - KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, - (u32)fault_address, (u32)(fault_address >> 32), - handler); + trace_kvm_page_fault(fault_address, error_code); /* * FIXME: Tis shouldn't be necessary here, but there is a flush * missing in the MMU code. Until we find this bug, flush the @@ -1288,14 +1280,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - KVMTRACE_0D(NMI, &svm->vcpu, handler); return 1; } static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { ++svm->vcpu.stat.irq_exits; - KVMTRACE_0D(INTR, &svm->vcpu, handler); return 1; } @@ -2077,8 +2067,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (svm_get_msr(&svm->vcpu, ecx, &data)) kvm_inject_gp(&svm->vcpu, 0); else { - KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, - (u32)(data >> 32), handler); + trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; @@ -2163,8 +2152,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) @@ -2185,8 +2173,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int interrupt_window_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); - svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -2265,8 +2251,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u32 exit_code = svm->vmcb->control.exit_code; - KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, - (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); + trace_kvm_exit(exit_code, svm->vmcb->save.rip); if (is_nested(svm)) { nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", @@ -2354,7 +2339,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; - KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); + trace_kvm_inj_virq(irq); ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; @@ -2717,6 +2702,59 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static const struct trace_print_flags svm_exit_reasons_str[] = { + { SVM_EXIT_READ_CR0, "read_cr0" }, + { SVM_EXIT_READ_CR3, "read_cr3" }, + { SVM_EXIT_READ_CR4, "read_cr4" }, + { SVM_EXIT_READ_CR8, "read_cr8" }, + { SVM_EXIT_WRITE_CR0, "write_cr0" }, + { SVM_EXIT_WRITE_CR3, "write_cr3" }, + { SVM_EXIT_WRITE_CR4, "write_cr4" }, + { SVM_EXIT_WRITE_CR8, "write_cr8" }, + { SVM_EXIT_READ_DR0, "read_dr0" }, + { SVM_EXIT_READ_DR1, "read_dr1" }, + { SVM_EXIT_READ_DR2, "read_dr2" }, + { SVM_EXIT_READ_DR3, "read_dr3" }, + { SVM_EXIT_WRITE_DR0, "write_dr0" }, + { SVM_EXIT_WRITE_DR1, "write_dr1" }, + { SVM_EXIT_WRITE_DR2, "write_dr2" }, + { SVM_EXIT_WRITE_DR3, "write_dr3" }, + { SVM_EXIT_WRITE_DR5, "write_dr5" }, + { SVM_EXIT_WRITE_DR7, "write_dr7" }, + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, + { SVM_EXIT_INTR, "interrupt" }, + { SVM_EXIT_NMI, "nmi" }, + { SVM_EXIT_SMI, "smi" }, + { SVM_EXIT_INIT, "init" }, + { SVM_EXIT_VINTR, "vintr" }, + { SVM_EXIT_CPUID, "cpuid" }, + { SVM_EXIT_INVD, "invd" }, + { SVM_EXIT_HLT, "hlt" }, + { SVM_EXIT_INVLPG, "invlpg" }, + { SVM_EXIT_INVLPGA, "invlpga" }, + { SVM_EXIT_IOIO, "io" }, + { SVM_EXIT_MSR, "msr" }, + { SVM_EXIT_TASK_SWITCH, "task_switch" }, + { SVM_EXIT_SHUTDOWN, "shutdown" }, + { SVM_EXIT_VMRUN, "vmrun" }, + { SVM_EXIT_VMMCALL, "hypercall" }, + { SVM_EXIT_VMLOAD, "vmload" }, + { SVM_EXIT_VMSAVE, "vmsave" }, + { SVM_EXIT_STGI, "stgi" }, + { SVM_EXIT_CLGI, "clgi" }, + { SVM_EXIT_SKINIT, "skinit" }, + { SVM_EXIT_WBINVD, "wbinvd" }, + { SVM_EXIT_MONITOR, "monitor" }, + { SVM_EXIT_MWAIT, "mwait" }, + { SVM_EXIT_NPF, "npf" }, + { -1, NULL } +}; + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2778,6 +2816,8 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, .get_mt_mask = svm_get_mt_mask, + + .exit_reasons_str = svm_exit_reasons_str, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h new file mode 100644 index 000000000000..cd8c90db41a5 --- /dev/null +++ b/arch/x86/kvm/trace.h @@ -0,0 +1,260 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_PATH arch/x86/kvm +#define TRACE_INCLUDE_FILE trace + +/* + * Tracepoint for guest mode entry. + */ +TRACE_EVENT(kvm_entry, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %u", __entry->vcpu_id) +); + +/* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hypercall, + TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3), + TP_ARGS(nr, a0, a1, a2, a3), + + TP_STRUCT__entry( + __field( unsigned long, nr ) + __field( unsigned long, a0 ) + __field( unsigned long, a1 ) + __field( unsigned long, a2 ) + __field( unsigned long, a3 ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->a0 = a0; + __entry->a1 = a1; + __entry->a2 = a2; + __entry->a3 = a3; + ), + + TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", + __entry->nr, __entry->a0, __entry->a1, __entry->a2, + __entry->a3) +); + +/* + * Tracepoint for PIO. + */ +TRACE_EVENT(kvm_pio, + TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, + unsigned int count), + TP_ARGS(rw, port, size, count), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, port ) + __field( unsigned int, size ) + __field( unsigned int, count ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->port = port; + __entry->size = size; + __entry->count = count; + ), + + TP_printk("pio_%s at 0x%x size %d count %d", + __entry->rw ? "write" : "read", + __entry->port, __entry->size, __entry->count) +); + +/* + * Tracepoint for cpuid. + */ +TRACE_EVENT(kvm_cpuid, + TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, + unsigned long rcx, unsigned long rdx), + TP_ARGS(function, rax, rbx, rcx, rdx), + + TP_STRUCT__entry( + __field( unsigned int, function ) + __field( unsigned long, rax ) + __field( unsigned long, rbx ) + __field( unsigned long, rcx ) + __field( unsigned long, rdx ) + ), + + TP_fast_assign( + __entry->function = function; + __entry->rax = rax; + __entry->rbx = rbx; + __entry->rcx = rcx; + __entry->rdx = rdx; + ), + + TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", + __entry->function, __entry->rax, + __entry->rbx, __entry->rcx, __entry->rdx) +); + +/* + * Tracepoint for apic access. + */ +TRACE_EVENT(kvm_apic, + TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), + TP_ARGS(rw, reg, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, reg ) + __field( unsigned int, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->reg = reg; + __entry->val = val; + ), + + TP_printk("apic_%s 0x%x = 0x%x", + __entry->rw ? "write" : "read", + __entry->reg, __entry->val) +); + +#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) +#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) + +/* + * Tracepoint for kvm guest exit: + */ +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), + TP_ARGS(exit_reason, guest_rip), + + TP_STRUCT__entry( + __field( unsigned int, exit_reason ) + __field( unsigned long, guest_rip ) + ), + + TP_fast_assign( + __entry->exit_reason = exit_reason; + __entry->guest_rip = guest_rip; + ), + + TP_printk("reason %s rip 0x%lx", + ftrace_print_symbols_seq(p, __entry->exit_reason, + kvm_x86_ops->exit_reasons_str), + __entry->guest_rip) +); + +/* + * Tracepoint for kvm interrupt injection: + */ +TRACE_EVENT(kvm_inj_virq, + TP_PROTO(unsigned int irq), + TP_ARGS(irq), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + ), + + TP_fast_assign( + __entry->irq = irq; + ), + + TP_printk("irq %u", __entry->irq) +); + +/* + * Tracepoint for page fault. + */ +TRACE_EVENT(kvm_page_fault, + TP_PROTO(unsigned long fault_address, unsigned int error_code), + TP_ARGS(fault_address, error_code), + + TP_STRUCT__entry( + __field( unsigned long, fault_address ) + __field( unsigned int, error_code ) + ), + + TP_fast_assign( + __entry->fault_address = fault_address; + __entry->error_code = error_code; + ), + + TP_printk("address %lx error_code %x", + __entry->fault_address, __entry->error_code) +); + +/* + * Tracepoint for guest MSR access. + */ +TRACE_EVENT(kvm_msr, + TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), + TP_ARGS(rw, ecx, data), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, ecx ) + __field( unsigned long, data ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->ecx = ecx; + __entry->data = data; + ), + + TP_printk("msr_%s %x = 0x%lx", + __entry->rw ? "write" : "read", + __entry->ecx, __entry->data) +); + +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) + +/* + * Tracepoint for guest CR access. + */ +TRACE_EVENT(kvm_cr, + TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), + TP_ARGS(rw, cr, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, cr ) + __field( unsigned long, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->cr = cr; + __entry->val = val; + ), + + TP_printk("cr_%s %x = 0x%lx", + __entry->rw ? "write" : "read", + __entry->cr, __entry->val) +); + +#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) +#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1a84ca191cd1..c6256b98f078 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -34,6 +35,8 @@ #include #include +#include "trace.h" + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -2550,7 +2553,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) uint32_t intr; int irq = vcpu->arch.interrupt.nr; - KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + trace_kvm_inj_virq(irq); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { @@ -2751,8 +2754,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (enable_ept) BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); - KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, - (u32)((u64)cr2 >> 32), handler); + trace_kvm_page_fault(cr2, error_code); + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); @@ -2799,7 +2802,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { ++vcpu->stat.irq_exits; - KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); return 1; } @@ -2847,7 +2849,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - unsigned long exit_qualification; + unsigned long exit_qualification, val; int cr; int reg; @@ -2856,21 +2858,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + val = kvm_register_read(vcpu, reg); + trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr0(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 3: - kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr3(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 4: - kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr4(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 8: { @@ -2892,23 +2892,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); vmx_fpu_activate(vcpu); - KVMTRACE_0D(CLTS, vcpu, handler); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { case 3: kvm_register_write(vcpu, reg, vcpu->arch.cr3); - KVMTRACE_3D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + trace_kvm_cr_read(cr, vcpu->arch.cr3); skip_emulated_instruction(vcpu); return 1; case 8: - kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); - KVMTRACE_2D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), handler); + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); skip_emulated_instruction(vcpu); return 1; } @@ -2976,7 +2972,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) val = 0; } kvm_register_write(vcpu, reg, val); - KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { val = vcpu->arch.regs[reg]; switch (dr) { @@ -3009,7 +3004,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } break; } - KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); } skip_emulated_instruction(vcpu); return 1; @@ -3031,8 +3025,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_read(ecx, data); /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; @@ -3047,8 +3040,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_write(ecx, data); if (vmx_set_msr(vcpu, ecx, data) != 0) { kvm_inject_gp(vcpu, 0); @@ -3075,7 +3067,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - KVMTRACE_0D(PEND_INTR, vcpu, handler); ++vcpu->stat.irq_window_exits; /* @@ -3227,6 +3218,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + trace_kvm_page_fault(gpa, exit_qualification); return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } @@ -3410,8 +3402,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), - (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); + trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); /* If we need to emulate an MMIO from handle_invalid_guest_state * we just return 0 */ @@ -3500,10 +3491,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) /* We need to handle NMIs before interrupts are enabled */ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { - KVMTRACE_0D(NMI, &vmx->vcpu, handler); + (exit_intr_info & INTR_INFO_VALID_MASK)) asm("int $2"); - } idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3891,6 +3880,29 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } +static const struct trace_print_flags vmx_exit_reasons_str[] = { + { EXIT_REASON_EXCEPTION_NMI, "exception" }, + { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, + { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, + { EXIT_REASON_NMI_WINDOW, "nmi_window" }, + { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, + { EXIT_REASON_CR_ACCESS, "cr_access" }, + { EXIT_REASON_DR_ACCESS, "dr_access" }, + { EXIT_REASON_CPUID, "cpuid" }, + { EXIT_REASON_MSR_READ, "rdmsr" }, + { EXIT_REASON_MSR_WRITE, "wrmsr" }, + { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, + { EXIT_REASON_HLT, "halt" }, + { EXIT_REASON_INVLPG, "invlpg" }, + { EXIT_REASON_VMCALL, "hypercall" }, + { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, + { EXIT_REASON_APIC_ACCESS, "apic_access" }, + { EXIT_REASON_WBINVD, "wbinvd" }, + { EXIT_REASON_TASK_SWITCH, "task_switch" }, + { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + { -1, NULL } +}; + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3950,6 +3962,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, .get_mt_mask = vmx_get_mt_mask, + + .exit_reasons_str = vmx_exit_reasons_str, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a066876f1373..892a7a60c815 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -37,6 +37,8 @@ #include #include #include +#define CREATE_TRACE_POINTS +#include "trace.h" #include #include @@ -347,9 +349,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); - KVMTRACE_1D(LMSW, vcpu, - (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), - handler); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -2568,7 +2567,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - KVMTRACE_0D(CLTS, vcpu, handler); kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); return X86EMUL_CONTINUE; } @@ -2851,12 +2849,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - if (vcpu->run->io.direction == KVM_EXIT_IO_IN) - KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, - handler); - else - KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, - handler); + trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, + size, 1); val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); @@ -2892,12 +2886,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - if (vcpu->run->io.direction == KVM_EXIT_IO_IN) - KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, - handler); - else - KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, - handler); + trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, + size, count); if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -3075,7 +3065,6 @@ void kvm_arch_exit(void) int kvm_emulate_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; @@ -3106,7 +3095,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); - KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); + trace_kvm_hypercall(nr, a0, a1, a2, a3); if (!is_long_mode(vcpu)) { nr &= 0xFFFFFFFF; @@ -3206,8 +3195,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); return 0; } - KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, - (u32)((u64)value >> 32), handler); return value; } @@ -3215,9 +3202,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, unsigned long *rflags) { - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, - (u32)((u64)val >> 32), handler); - switch (cr) { case 0: kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); @@ -3327,11 +3311,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); } kvm_x86_ops->skip_emulated_instruction(vcpu); - KVMTRACE_5D(CPUID, vcpu, function, - (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); + trace_kvm_cpuid(function, + kvm_register_read(vcpu, VCPU_REGS_RAX), + kvm_register_read(vcpu, VCPU_REGS_RBX), + kvm_register_read(vcpu, VCPU_REGS_RCX), + kvm_register_read(vcpu, VCPU_REGS_RDX)); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -3527,7 +3511,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_debugreg(vcpu->arch.eff_db[3], 3); } - KVMTRACE_0D(VMENTRY, vcpu, entryexit); + trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu, kvm_run); if (unlikely(vcpu->arch.switch_db_regs)) { @@ -4842,3 +4826,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { return kvm_x86_ops->interrupt_allowed(vcpu); } + +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h new file mode 100644 index 000000000000..d74b23d803f1 --- /dev/null +++ b/include/trace/events/kvm.h @@ -0,0 +1,57 @@ +#if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_MAIN_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_FILE kvm + +#if defined(__KVM_HAVE_IOAPIC) +TRACE_EVENT(kvm_set_irq, + TP_PROTO(unsigned int gsi), + TP_ARGS(gsi), + + TP_STRUCT__entry( + __field( unsigned int, gsi ) + ), + + TP_fast_assign( + __entry->gsi = gsi; + ), + + TP_printk("gsi %u", __entry->gsi) +); + + +#define kvm_irqchips \ + {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ + {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ + {KVM_IRQCHIP_IOAPIC, "IOAPIC"} + +TRACE_EVENT(kvm_ack_irq, + TP_PROTO(unsigned int irqchip, unsigned int pin), + TP_ARGS(irqchip, pin), + + TP_STRUCT__entry( + __field( unsigned int, irqchip ) + __field( unsigned int, pin ) + ), + + TP_fast_assign( + __entry->irqchip = irqchip; + __entry->pin = pin; + ), + + TP_printk("irqchip %s pin %u", + __print_symbolic(__entry->irqchip, kvm_irqchips), + __entry->pin) +); + + + +#endif /* defined(__KVM_HAVE_IOAPIC) */ +#endif /* _TRACE_KVM_MAIN_H */ + +/* This part must be outside protection */ +#include diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index bb8a1b5e41c1..94759ed96b64 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -20,6 +20,7 @@ */ #include +#include #include #ifdef CONFIG_IA64 @@ -125,6 +126,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) unsigned long *irq_state, sig_level; int ret = -1; + trace_kvm_set_irq(irq); + WARN_ON(!mutex_is_locked(&kvm->irq_lock)); if (irq < KVM_IOAPIC_NUM_PINS) { @@ -161,6 +164,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) struct hlist_node *n; unsigned gsi = pin; + trace_kvm_ack_irq(irqchip, pin); + list_for_each_entry(e, &kvm->irq_routing, link) if (e->type == KVM_IRQ_ROUTING_IRQCHIP && e->irqchip.irqchip == irqchip && diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 48d5e697bf44..04bdeddebdac 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -59,6 +59,9 @@ #include "irq.h" #endif +#define CREATE_TRACE_POINTS +#include + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -2718,6 +2721,7 @@ EXPORT_SYMBOL_GPL(kvm_init); void kvm_exit(void) { kvm_trace_cleanup(); + tracepoint_synchronize_unregister(); misc_deregister(&kvm_dev); kmem_cache_destroy(kvm_vcpu_cache); sysdev_unregister(&kvm_sysdev); -- cgit v1.2.3-70-g09d2 From 0cb5762ed2b3113b3b8aa84d1d26b815aea71787 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 17 Jun 2009 15:50:31 +0200 Subject: KVM: Allow emulation of syscalls instructions on #UD Add the opcodes for syscall, sysenter and sysexit to the list of instructions handled by the undefined opcode handler. Signed-off-by: Christoph Egger Signed-off-by: Amit Shah Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 892a7a60c815..57e76b37242f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2667,14 +2667,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu, r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - /* Reject the instructions other than VMCALL/VMMCALL when - * try to emulate invalid opcode */ + /* Only allow emulation of specific instructions on #UD + * (namely VMMCALL, sysenter, sysexit, syscall)*/ c = &vcpu->arch.emulate_ctxt.decode; - if ((emulation_type & EMULTYPE_TRAP_UD) && - (!(c->twobyte && c->b == 0x01 && - (c->modrm_reg == 0 || c->modrm_reg == 3) && - c->modrm_mod == 3 && c->modrm_rm == 1))) - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_TRAP_UD) { + if (!c->twobyte) + return EMULATE_FAIL; + switch (c->b) { + case 0x01: /* VMMCALL */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return EMULATE_FAIL; + break; + case 0x34: /* sysenter */ + case 0x35: /* sysexit */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + case 0x05: /* syscall */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + default: + return EMULATE_FAIL; + } + + if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) + return EMULATE_FAIL; + } ++vcpu->stat.insn_emulation; if (r) { -- cgit v1.2.3-70-g09d2 From b1d861431ed58f752b31e8c07da029072989bec7 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 17 Jun 2009 15:50:32 +0200 Subject: KVM: x86 emulator: Add missing EFLAGS bit definitions Signed-off-by: Christoph Egger Signed-off-by: Amit Shah Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ef4dfca3ed7e..67af33aeb3f3 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -320,8 +320,11 @@ static u32 group2_table[] = { }; /* EFLAGS bit definitions. */ +#define EFLG_VM (1<<17) +#define EFLG_RF (1<<16) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) +#define EFLG_IF (1<<9) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) -- cgit v1.2.3-70-g09d2 From e99f0507125f45b723a9069e9e854c3c4758e7ba Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 17 Jun 2009 15:50:33 +0200 Subject: KVM: x86 emulator: Prepare for emulation of syscall instructions Add the flags needed for syscall, sysenter and sysexit to the opcode table. Catch (but for now ignore) the opcodes in the emulation switch/case. Signed-off-by: Andre Przywara Signed-off-by: Amit Shah Signed-off-by: Christoph Egger Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 67af33aeb3f3..b0da29d74032 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -32,6 +32,8 @@ #include #include +#include "mmu.h" /* for is_long_mode() */ + /* * Opcode effective-address decode tables. * Note that we only emulate instructions that have at least one memory @@ -209,7 +211,7 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, + 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, @@ -217,7 +219,9 @@ static u32 twobyte_table[256] = { ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ImplicitOps, 0, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -1988,6 +1992,9 @@ twobyte_insn: goto cannot_emulate; } break; + case 0x05: /* syscall */ + goto cannot_emulate; + break; case 0x06: emulate_clts(ctxt->vcpu); c->dst.type = OP_NONE; @@ -2054,6 +2061,12 @@ twobyte_insn: rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; break; + case 0x34: /* sysenter */ + goto cannot_emulate; + break; + case 0x35: /* sysexit */ + goto cannot_emulate; + break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; if (!test_cc(c->b, ctxt->eflags)) -- cgit v1.2.3-70-g09d2 From e66bb2ccdcf76d032bbb464b35c292bb3ee58f9b Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 18 Jun 2009 12:56:00 +0200 Subject: KVM: x86 emulator: add syscall emulation Handle #UD intercept of the syscall instruction in 32bit compat mode on an Intel host. Setup the segment descriptors for CS and SS and the EIP/ESP registers according to the manual. Save the RIP and EFLAGS to the correct registers. [avi: fix build on i386 due to missing R11] Signed-off-by: Christoph Egger Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 84 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index b0da29d74032..4d7256da59d8 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1397,6 +1397,85 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) ctxt->interruptibility = mask; } +static inline void +setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, + struct kvm_segment *cs, struct kvm_segment *ss) +{ + memset(cs, 0, sizeof(struct kvm_segment)); + kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); + memset(ss, 0, sizeof(struct kvm_segment)); + + cs->l = 0; /* will be adjusted later */ + cs->base = 0; /* flat segment */ + cs->g = 1; /* 4kb granularity */ + cs->limit = 0xffffffff; /* 4GB limit */ + cs->type = 0x0b; /* Read, Execute, Accessed */ + cs->s = 1; + cs->dpl = 0; /* will be adjusted later */ + cs->present = 1; + cs->db = 1; + + ss->unusable = 0; + ss->base = 0; /* flat segment */ + ss->limit = 0xffffffff; /* 4GB limit */ + ss->g = 1; /* 4kb granularity */ + ss->s = 1; + ss->type = 0x03; /* Read/Write, Accessed */ + ss->db = 1; /* 32bit stack segment */ + ss->dpl = 0; + ss->present = 1; +} + +static int +emulate_syscall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* syscall is not available in real mode */ + if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + msr_data >>= 32; + cs.selector = (u16)(msr_data & 0xfffc); + ss.selector = (u16)(msr_data + 8); + + if (is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->regs[VCPU_REGS_RCX] = c->eip; + if (is_long_mode(ctxt->vcpu)) { +#ifdef CONFIG_X86_64 + c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + + kvm_x86_ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? + MSR_LSTAR : MSR_CSTAR, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ctxt->eflags &= ~(msr_data | EFLG_RF); +#endif + } else { + /* legacy mode */ + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + c->eip = (u32)msr_data; + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + } + + return 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1993,7 +2072,10 @@ twobyte_insn: } break; case 0x05: /* syscall */ - goto cannot_emulate; + if (emulate_syscall(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; break; case 0x06: emulate_clts(ctxt->vcpu); -- cgit v1.2.3-70-g09d2 From 8c60435261deaefeb53ce3222d04d7d5bea81296 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 18 Jun 2009 12:56:01 +0200 Subject: KVM: x86 emulator: Add sysenter emulation Handle #UD intercept of the sysenter instruction in 32bit compat mode on an AMD host. Setup the segment descriptors for CS and SS and the EIP/ESP registers according to the manual. Signed-off-by: Christoph Egger Signed-off-by: Amit Shah Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 70 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 4d7256da59d8..7a9bddb3ebd4 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1476,6 +1476,71 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) return 0; } +static int +emulate_sysenter(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL || + !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* XXX sysenter/sysexit have not been tested in 64bit mode. + * Therefore, we inject an #UD. + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (ctxt->mode) { + case X86EMUL_MODE_PROT32: + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + case X86EMUL_MODE_PROT64: + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + } + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + cs.selector = (u16)msr_data; + cs.selector &= ~SELECTOR_RPL_MASK; + ss.selector = cs.selector + 8; + ss.selector &= ~SELECTOR_RPL_MASK; + if (ctxt->mode == X86EMUL_MODE_PROT64 + || is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + c->regs[VCPU_REGS_RSP] = msr_data; + + return 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -2144,7 +2209,10 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - goto cannot_emulate; + if (emulate_sysenter(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; break; case 0x35: /* sysexit */ goto cannot_emulate; -- cgit v1.2.3-70-g09d2 From 4668f050787015805a7e8ea29cc4f81d8f07cedb Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 18 Jun 2009 12:56:02 +0200 Subject: KVM: x86 emulator: Add sysexit emulation Handle #UD intercept of the sysexit instruction in 64bit mode returning to 32bit compat mode on an AMD host. Setup the segment descriptors for CS and SS and the EIP/ESP registers according to the manual. Signed-off-by: Christoph Egger Signed-off-by: Amit Shah Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 72 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 7a9bddb3ebd4..c6663d46f328 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1541,6 +1541,73 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) return 0; } +static int +emulate_sysexit(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + int usermode; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* sysexit must be called from CPL 0 */ + if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + setup_syscalls_segments(ctxt, &cs, &ss); + + if ((c->rex_prefix & 0x8) != 0x0) + usermode = X86EMUL_MODE_PROT64; + else + usermode = X86EMUL_MODE_PROT32; + + cs.dpl = 3; + ss.dpl = 3; + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (usermode) { + case X86EMUL_MODE_PROT32: + cs.selector = (u16)(msr_data + 16); + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = (u16)(msr_data + 24); + break; + case X86EMUL_MODE_PROT64: + cs.selector = (u16)(msr_data + 32); + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = cs.selector + 8; + cs.db = 0; + cs.l = 1; + break; + } + cs.selector |= SELECTOR_RPL_MASK; + ss.selector |= SELECTOR_RPL_MASK; + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; + + return 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -2215,7 +2282,10 @@ twobyte_insn: goto writeback; break; case 0x35: /* sysexit */ - goto cannot_emulate; + if (emulate_sysexit(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; -- cgit v1.2.3-70-g09d2 From 3eea8437f61ca1f9192d146124425e759c3e5758 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 23 Jun 2009 17:24:06 +0200 Subject: KVM: s390: Fix memslot initialization for userspace_addr != 0 Since commit 854b5338196b1175706e99d63be43a4f8d8ab607 Author: Christian Ehrhardt KVM: s390: streamline memslot handling s390 uses the values of the memslot instead of doing everything in the arch ioctl handler of the KVM_SET_USER_MEMORY_REGION. Unfortunately we missed to set the userspace_addr of our memslot due to our s390 ifdef in __kvm_set_memory_region. Old s390 userspace launchers did not notice, since they started the guest at userspace address 0. Because of CONFIG_DEFAULT_MMAP_MIN_ADDR we now put the guest at 1M userspace, which does not work. This patch makes sure that new.userspace_addr is set on s390. This fix should go in quickly. Nevertheless, looking at the code we should clean up that ifdef in the long term. Any kernel janitors? Signed-off-by: Christian Borntraeger Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 04bdeddebdac..1da8072d61b1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1199,6 +1199,10 @@ int __kvm_set_memory_region(struct kvm *kvm, if (old.npages) kvm_arch_flush_shadow(kvm); } +#else /* not defined CONFIG_S390 */ + new.user_alloc = user_alloc; + if (user_alloc) + new.userspace_addr = mem->userspace_addr; #endif /* not defined CONFIG_S390 */ if (!npages) -- cgit v1.2.3-70-g09d2 From f340ca0f065ecf3e7549687e763370106dacb2c2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 19 Jun 2009 15:16:22 +0200 Subject: hugetlbfs: export vma_kernel_pagsize to modules This function is required by KVM. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cafdcee154e8..b16d63634777 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return 1UL << (hstate->order + PAGE_SHIFT); } +EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority -- cgit v1.2.3-70-g09d2 From ec04b2604c3707a46db1d26d98f82b11d0844669 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 19 Jun 2009 15:16:23 +0200 Subject: KVM: Prepare memslot data structures for multiple hugepage sizes [avi: fix build on non-x86] Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 3 +- arch/powerpc/include/asm/kvm_host.h | 3 +- arch/s390/include/asm/kvm_host.h | 6 +++- arch/x86/include/asm/kvm_host.h | 12 ++++---- arch/x86/kvm/mmu.c | 30 ++++++++++---------- arch/x86/kvm/paging_tmpl.h | 3 +- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 56 ++++++++++++++++++++++++++----------- 8 files changed, 73 insertions(+), 42 deletions(-) diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 9cf1c4b1f92f..d9b6325a9328 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -235,7 +235,8 @@ struct kvm_vm_data { #define KVM_REQ_PTC_G 32 #define KVM_REQ_RESUME 33 -#define KVM_PAGES_PER_HPAGE 1 +#define KVM_NR_PAGE_SIZES 1 +#define KVM_PAGES_PER_HPAGE(x) 1 struct kvm; struct kvm_vcpu; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d4caa6127f55..c9c930ed11d7 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -34,7 +34,8 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 /* We don't currently support large pages. */ -#define KVM_PAGES_PER_HPAGE (1UL << 31) +#define KVM_NR_PAGE_SIZES 1 +#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) struct kvm; struct kvm_run; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 75535d4d7a05..78e07a622b45 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -40,7 +40,11 @@ struct sca_block { struct sca_entry cpu[64]; } __attribute__((packed)); -#define KVM_PAGES_PER_HPAGE 256 +#define KVM_NR_PAGE_SIZES 2 +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8) +#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) +#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) +#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define CPUSTAT_HOST 0x80000000 #define CPUSTAT_WAIT 0x10000000 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 19027ab20412..30b625d8e5f0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -54,12 +54,12 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) -/* shadow tables are PAE even on non-PAE hosts */ -#define KVM_HPAGE_SHIFT 21 -#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) -#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) - -#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) +/* KVM Hugepage definitions for x86 */ +#define KVM_NR_PAGE_SIZES 2 +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) +#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) +#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define DE_VECTOR 0 #define DB_VECTOR 1 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 12974de88aa5..b67585c1ef08 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -394,9 +394,9 @@ static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); - return &slot->lpage_info[idx].write_count; + idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)); + return &slot->lpage_info[0][idx].write_count; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) @@ -485,10 +485,10 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) if (!lpage) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); + idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)); - return &slot->lpage_info[idx].rmap_pde; + return &slot->lpage_info[0][idx].rmap_pde; } /* @@ -731,11 +731,11 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + int idx = gfn_offset / + KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL); retval |= handler(kvm, &memslot->rmap[gfn_offset]); retval |= handler(kvm, - &memslot->lpage_info[ - gfn_offset / - KVM_PAGES_PER_HPAGE].rmap_pde); + &memslot->lpage_info[0][idx].rmap_pde); } } @@ -1876,8 +1876,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn_t pfn; unsigned long mmu_seq; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); + if (is_largepage_backed(vcpu, gfn & + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) { + gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); largepage = 1; } @@ -2082,8 +2083,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); + if (is_largepage_backed(vcpu, gfn & + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) { + gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; @@ -2485,7 +2487,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); + gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); vcpu->arch.update_pte.largepage = 1; } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 322e8113aeea..53e129cec5fd 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -401,7 +401,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (walker.level == PT_DIRECTORY_LEVEL) { gfn_t large_gfn; - large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); + large_gfn = walker.gfn & + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); if (is_largepage_backed(vcpu, large_gfn)) { walker.gfn = large_gfn; largepage = 1; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6988858dc56e..06af936a250a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -103,7 +103,7 @@ struct kvm_memory_slot { struct { unsigned long rmap_pde; int write_count; - } *lpage_info; + } *lpage_info[KVM_NR_PAGE_SIZES - 1]; unsigned long userspace_addr; int user_alloc; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1da8072d61b1..8361662e7e0a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1001,19 +1001,25 @@ out: static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { + int i; + if (!dont || free->rmap != dont->rmap) vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) vfree(free->dirty_bitmap); - if (!dont || free->lpage_info != dont->lpage_info) - vfree(free->lpage_info); + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { + vfree(free->lpage_info[i]); + free->lpage_info[i] = NULL; + } + } free->npages = 0; free->dirty_bitmap = NULL; free->rmap = NULL; - free->lpage_info = NULL; } void kvm_free_physmem(struct kvm *kvm) @@ -1087,7 +1093,8 @@ int __kvm_set_memory_region(struct kvm *kvm, int r; gfn_t base_gfn; unsigned long npages, ugfn; - unsigned long largepages, i; + int lpages; + unsigned long i, j; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; @@ -1161,33 +1168,48 @@ int __kvm_set_memory_region(struct kvm *kvm, else new.userspace_addr = 0; } - if (npages && !new.lpage_info) { - largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; - largepages -= base_gfn / KVM_PAGES_PER_HPAGE; + if (!npages) + goto skip_lpage; - new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + int level = i + 2; - if (!new.lpage_info) + /* Avoid unused variable warning if no large pages */ + (void)level; + + if (new.lpage_info[i]) + continue; + + lpages = 1 + (base_gfn + npages - 1) / + KVM_PAGES_PER_HPAGE(level); + lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); + + new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); + + if (!new.lpage_info[i]) goto out_free; - memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); + memset(new.lpage_info[i], 0, + lpages * sizeof(*new.lpage_info[i])); - if (base_gfn % KVM_PAGES_PER_HPAGE) - new.lpage_info[0].write_count = 1; - if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) - new.lpage_info[largepages-1].write_count = 1; + if (base_gfn % KVM_PAGES_PER_HPAGE(level)) + new.lpage_info[i][0].write_count = 1; + if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) + new.lpage_info[i][lpages - 1].write_count = 1; ugfn = new.userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each * other, or if explicitly asked to, disable large page * support for this slot */ - if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1) || + if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || !largepages_enabled) - for (i = 0; i < largepages; ++i) - new.lpage_info[i].write_count = 1; + for (j = 0; j < lpages; ++j) + new.lpage_info[i][j].write_count = 1; } +skip_lpage: + /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; -- cgit v1.2.3-70-g09d2 From 894a9c5543abf6f88d36dc1b9f5d90f35db09cb3 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Jun 2009 15:05:14 -0300 Subject: KVM: x86: missing locking in PIT/IRQCHIP/SET_BSP_CPU ioctl paths Correct missing locking in a few places in x86's vm_ioctl handling path. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 -- arch/x86/kvm/x86.c | 12 ++++++++++++ virt/kvm/kvm_main.c | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 15fc95b2fc05..bcd00c76d69e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -345,9 +345,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) { - mutex_lock(&kvm->arch.vpit->pit_state.lock); pit_load_count(kvm, channel, val); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); } static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57e76b37242f..e9b0982d2589 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1987,19 +1987,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: + mutex_lock(&kvm->irq_lock); memcpy(ioapic_irqchip(kvm), &chip->chip.ioapic, sizeof(struct kvm_ioapic_state)); + mutex_unlock(&kvm->irq_lock); break; default: r = -EINVAL; @@ -2013,7 +2019,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int r = 0; + mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; } @@ -2021,8 +2029,10 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int r = 0; + mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); kvm_pit_load_count(kvm, 0, ps->channels[0].count); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; } @@ -2031,7 +2041,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, { if (!kvm->arch.vpit) return -ENXIO; + mutex_lock(&kvm->arch.vpit->pit_state.lock); kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8361662e7e0a..f1e2e8c373c6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2274,10 +2274,12 @@ static long kvm_vm_ioctl(struct file *filp, #ifdef CONFIG_KVM_APIC_ARCHITECTURE case KVM_SET_BOOT_CPU_ID: r = 0; + mutex_lock(&kvm->lock); if (atomic_read(&kvm->online_vcpus) != 0) r = -EBUSY; else kvm->bsp_vcpu_id = arg; + mutex_unlock(&kvm->lock); break; #endif default: -- cgit v1.2.3-70-g09d2 From 8f1589d95e5eab1ed287f217a33656e922cdbdd0 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 24 Jun 2009 12:44:33 +0200 Subject: KVM: ignore AMDs HWCR register access to set the FFDIS bit Linux tries to disable the flush filter on all AMD K8 CPUs. Since KVM does not handle the needed MSR, the injected #GP will panic the Linux kernel. Ignore setting of the HWCR.FFDIS bit in this MSR to let Linux boot with an AMD K8 family guest CPU. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 1 - arch/x86/kvm/x86.c | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b1c446208867..8728e514c851 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2137,7 +2137,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; case MSR_VM_CR: case MSR_VM_IGNNE: - case MSR_K7_HWCR: pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e9b0982d2589..cae5b12bf938 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -833,6 +833,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_EFER: set_efer(vcpu, data); break; + case MSR_K7_HWCR: + data &= ~(u64)0x40; /* ignore flush filter disable */ + if (data != 0) { + pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", + data); + return 1; + } + break; case MSR_IA32_DEBUGCTLMSR: if (!data) { /* We support the non-activated case already */ -- cgit v1.2.3-70-g09d2 From 1fdbd48c242db996107f72ae4140ffe8163e26a8 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 24 Jun 2009 12:44:34 +0200 Subject: KVM: ignore reads from AMDs C1E enabled MSR If the Linux kernel detects an C1E capable AMD processor (K8 RevF and higher), it will access a certain MSR on every attempt to go to halt. Explicitly handle this read and return 0 to let KVM run a Linux guest with the native AMD host CPU propagated to the guest. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cae5b12bf938..6aace61fdb62 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1038,6 +1038,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: + case MSR_K8_INT_PENDING_MSG: data = 0; break; case MSR_MTRRcap: -- cgit v1.2.3-70-g09d2 From ed85c0685321a139cefd6622b21467643f0159e1 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 25 Jun 2009 12:36:49 +0200 Subject: KVM: introduce module parameter for ignoring unknown MSRs accesses KVM will inject a #GP into the guest if that tries to access unhandled MSRs. This will crash many guests. Although it would be the correct way to actually handle these MSRs, we introduce a runtime switchable module param called "ignore_msrs" (defaults to 0). If this is Y, unknown MSR reads will return 0, while MSR writes are simply dropped. In both cases we print a message to dmesg to inform the user about that. You can change the behaviour at any time by saying: # echo 1 > /sys/modules/kvm/parameters/ignore_msrs Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6aace61fdb62..0be75d53b7fd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -83,6 +83,9 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); +int ignore_msrs = 0; +module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -930,8 +933,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) "0x%x data 0x%llx\n", msr, data); break; default: - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); - return 1; + if (!ignore_msrs) { + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); + return 1; + } else { + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + msr, data); + break; + } } return 0; } @@ -1078,8 +1088,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); default: - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); - return 1; + if (!ignore_msrs) { + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } else { + pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); + data = 0; + } + break; } *pdata = data; return 0; -- cgit v1.2.3-70-g09d2 From 46f43c6ee022c3aeb9686b104234b9f27fac03c2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 18 Jun 2009 11:47:27 -0300 Subject: KVM: powerpc: convert marker probes to event trace [avi: make it build] [avi: fold trace-arch.h into trace.h] CC: Hollis Blanchard Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/powerpc/kvm/44x_tlb.c | 11 ++--- arch/powerpc/kvm/Makefile | 4 ++ arch/powerpc/kvm/e500_tlb.c | 16 +++---- arch/powerpc/kvm/emulate.c | 3 +- arch/powerpc/kvm/powerpc.c | 3 ++ arch/powerpc/kvm/trace.h | 104 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 126 insertions(+), 15 deletions(-) create mode 100644 arch/powerpc/kvm/trace.h diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 4a16f472cc18..ff3cb63b8117 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -30,6 +30,7 @@ #include "timing.h" #include "44x_tlb.h" +#include "trace.h" #ifndef PPC44x_TLBE_SIZE #define PPC44x_TLBE_SIZE PPC44x_TLB_4K @@ -263,7 +264,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x, /* XXX set tlb_44x_index to stlb_index? */ - KVMTRACE_1D(STLB_INVAL, &vcpu_44x->vcpu, stlb_index, handler); + trace_kvm_stlb_inval(stlb_index); } void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) @@ -365,8 +366,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, /* Insert shadow mapping into hardware TLB. */ kvmppc_44x_tlbe_set_modified(vcpu_44x, victim); kvmppc_44x_tlbwe(victim, &stlbe); - KVMTRACE_5D(STLB_WRITE, vcpu, victim, stlbe.tid, stlbe.word0, stlbe.word1, - stlbe.word2, handler); + trace_kvm_stlb_write(victim, stlbe.tid, stlbe.word0, stlbe.word1, + stlbe.word2); } /* For a particular guest TLB entry, invalidate the corresponding host TLB @@ -485,8 +486,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); } - KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0, - tlbe->word1, tlbe->word2, handler); + trace_kvm_gtlb_write(gtlb_index, tlbe->tid, tlbe->word0, tlbe->word1, + tlbe->word2); kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); return EMULATE_DONE; diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 459c7ee580f7..4f407f2662f7 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -10,6 +10,10 @@ common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) +CFLAGS_44x_tlb.o := -I. +CFLAGS_e500_tlb.o := -I. +CFLAGS_emulate.o := -I. + kvm-objs := $(common-objs-y) powerpc.o emulate.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index a2048ac095ab..fb1e1dc11ba5 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -22,6 +22,7 @@ #include "../mm/mmu_decl.h" #include "e500_tlb.h" +#include "trace.h" #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) @@ -224,9 +225,8 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); stlbe->mas1 = 0; - KVMTRACE_5D(STLB_INVAL, &vcpu_e500->vcpu, index_of(tlbsel, esel), - stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7, - handler); + trace_kvm_stlb_inval(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, + stlbe->mas3, stlbe->mas7); } static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, @@ -319,9 +319,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, vcpu_e500->vcpu.arch.msr & MSR_PR); stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; - KVMTRACE_5D(STLB_WRITE, &vcpu_e500->vcpu, index_of(tlbsel, esel), - stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7, - handler); + trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, + stlbe->mas3, stlbe->mas7); } /* XXX only map the one-one case, for now use TLB0 */ @@ -535,9 +534,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) gtlbe->mas3 = vcpu_e500->mas3; gtlbe->mas7 = vcpu_e500->mas7; - KVMTRACE_5D(GTLB_WRITE, vcpu, vcpu_e500->mas0, - gtlbe->mas1, gtlbe->mas2, gtlbe->mas3, gtlbe->mas7, - handler); + trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2, + gtlbe->mas3, gtlbe->mas7); /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ if (tlbe_is_host_safe(vcpu, gtlbe)) { diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 28a8237fe78b..7737146af3fb 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -29,6 +29,7 @@ #include #include #include "timing.h" +#include "trace.h" #define OP_TRAP 3 @@ -419,7 +420,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } } - KVMTRACE_3D(PPC_INSTR, vcpu, inst, (int)vcpu->arch.pc, emulated, entryexit); + trace_kvm_ppc_instr(inst, vcpu->arch.pc, emulated); if (advance) vcpu->arch.pc += 4; /* Advance past emulated instruction. */ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 7ad30e0a1b9a..0341391eff12 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -31,6 +31,9 @@ #include "timing.h" #include "../mm/mmu_decl.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { return gfn; diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h new file mode 100644 index 000000000000..67f219de0455 --- /dev/null +++ b/arch/powerpc/kvm/trace.h @@ -0,0 +1,104 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace + +/* + * Tracepoint for guest mode entry. + */ +TRACE_EVENT(kvm_ppc_instr, + TP_PROTO(unsigned int inst, unsigned long pc, unsigned int emulate), + TP_ARGS(inst, pc, emulate), + + TP_STRUCT__entry( + __field( unsigned int, inst ) + __field( unsigned long, pc ) + __field( unsigned int, emulate ) + ), + + TP_fast_assign( + __entry->inst = inst; + __entry->pc = pc; + __entry->emulate = emulate; + ), + + TP_printk("inst %u pc 0x%lx emulate %u\n", + __entry->inst, __entry->pc, __entry->emulate) +); + +TRACE_EVENT(kvm_stlb_inval, + TP_PROTO(unsigned int stlb_index), + TP_ARGS(stlb_index), + + TP_STRUCT__entry( + __field( unsigned int, stlb_index ) + ), + + TP_fast_assign( + __entry->stlb_index = stlb_index; + ), + + TP_printk("stlb_index %u", __entry->stlb_index) +); + +TRACE_EVENT(kvm_stlb_write, + TP_PROTO(unsigned int victim, unsigned int tid, unsigned int word0, + unsigned int word1, unsigned int word2), + TP_ARGS(victim, tid, word0, word1, word2), + + TP_STRUCT__entry( + __field( unsigned int, victim ) + __field( unsigned int, tid ) + __field( unsigned int, word0 ) + __field( unsigned int, word1 ) + __field( unsigned int, word2 ) + ), + + TP_fast_assign( + __entry->victim = victim; + __entry->tid = tid; + __entry->word0 = word0; + __entry->word1 = word1; + __entry->word2 = word2; + ), + + TP_printk("victim %u tid %u w0 %u w1 %u w2 %u", + __entry->victim, __entry->tid, __entry->word0, + __entry->word1, __entry->word2) +); + +TRACE_EVENT(kvm_gtlb_write, + TP_PROTO(unsigned int gtlb_index, unsigned int tid, unsigned int word0, + unsigned int word1, unsigned int word2), + TP_ARGS(gtlb_index, tid, word0, word1, word2), + + TP_STRUCT__entry( + __field( unsigned int, gtlb_index ) + __field( unsigned int, tid ) + __field( unsigned int, word0 ) + __field( unsigned int, word1 ) + __field( unsigned int, word2 ) + ), + + TP_fast_assign( + __entry->gtlb_index = gtlb_index; + __entry->tid = tid; + __entry->word0 = word0; + __entry->word1 = word1; + __entry->word2 = word2; + ), + + TP_printk("gtlb_index %u tid %u w0 %u w1 %u w2 %u", + __entry->gtlb_index, __entry->tid, __entry->word0, + __entry->word1, __entry->word2) +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3-70-g09d2 From 2023a29cbe34139afcea8f65f8aef78c325c5dc0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 18 Jun 2009 11:47:28 -0300 Subject: KVM: remove old KVMTRACE support code Return EOPNOTSUPP for KVM_TRACE_ENABLE/PAUSE/DISABLE ioctls. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/ia64/kvm/Kconfig | 3 - arch/powerpc/kvm/Kconfig | 11 -- arch/powerpc/kvm/Makefile | 2 - arch/s390/kvm/Kconfig | 3 - arch/x86/kvm/Kconfig | 12 -- arch/x86/kvm/Makefile | 1 - include/linux/kvm.h | 31 +---- include/linux/kvm_host.h | 31 ----- virt/kvm/kvm_main.c | 3 +- virt/kvm/kvm_trace.c | 285 ---------------------------------------------- 10 files changed, 2 insertions(+), 380 deletions(-) delete mode 100644 virt/kvm/kvm_trace.c diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index cbadd8a65233..ef3e7be29caf 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -47,9 +47,6 @@ config KVM_INTEL Provides support for KVM on Itanium 2 processors equipped with the VT extensions. -config KVM_TRACE - bool - source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 46019dccce1c..c29926846613 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -58,17 +58,6 @@ config KVM_E500 If unsure, say N. -config KVM_TRACE - bool "KVM trace support" - depends on KVM && MARKERS && SYSFS - select RELAY - select DEBUG_FS - default n - ---help--- - This option allows reading a trace of kvm-related events through - relayfs. Note the ABI is not considered stable and will be - modified in future updates. - source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 4f407f2662f7..37655fe19f2f 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -8,8 +8,6 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) -common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) - CFLAGS_44x_tlb.o := -I. CFLAGS_e500_tlb.o := -I. CFLAGS_emulate.o := -I. diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index ad75ce33be12..bf164fc21864 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -34,9 +34,6 @@ config KVM If unsure, say N. -config KVM_TRACE - bool - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/virtio/Kconfig diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 7fbedfd34d6c..b84e571f4175 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -62,18 +62,6 @@ config KVM_AMD To compile this as a module, choose M here: the module will be called kvm-amd. -config KVM_TRACE - bool "KVM trace support" - depends on KVM && SYSFS - select MARKERS - select RELAY - select DEBUG_FS - default n - ---help--- - This option allows reading a trace of kvm-related events through - relayfs. Note the ABI is not considered stable and will be - modified in future updates. - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/lguest/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 7c56850b82cb..afaaa7627d95 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,6 @@ CFLAGS_vmx.o := -I. kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o) -kvm-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-y += x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 671051829da6..76c640834ea6 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -14,7 +14,7 @@ #define KVM_API_VERSION 12 -/* for KVM_TRACE_ENABLE */ +/* for KVM_TRACE_ENABLE, deprecated */ struct kvm_user_trace_setup { __u32 buf_size; /* sub_buffer size of each per-cpu */ __u32 buf_nr; /* the number of sub_buffers of each per-cpu */ @@ -325,35 +325,6 @@ struct kvm_guest_debug { #define KVM_TRC_CYCLE_SIZE 8 #define KVM_TRC_EXTRA_MAX 7 -/* This structure represents a single trace buffer record. */ -struct kvm_trace_rec { - /* variable rec_val - * is split into: - * bits 0 - 27 -> event id - * bits 28 -30 -> number of extra data args of size u32 - * bits 31 -> binary indicator for if tsc is in record - */ - __u32 rec_val; - __u32 pid; - __u32 vcpu_id; - union { - struct { - __u64 timestamp; - __u32 extra_u32[KVM_TRC_EXTRA_MAX]; - } __attribute__((packed)) timestamp; - struct { - __u32 extra_u32[KVM_TRC_EXTRA_MAX]; - } notimestamp; - } u; -}; - -#define TRACE_REC_EVENT_ID(val) \ - (0x0fffffff & (val)) -#define TRACE_REC_NUM_DATA_ARGS(val) \ - (0x70000000 & ((val) << 28)) -#define TRACE_REC_TCS(val) \ - (0x80000000 & ((val) << 31)) - #define KVMIO 0xAE /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 06af936a250a..0604d56f6eed 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -482,37 +482,6 @@ struct kvm_stats_debugfs_item { extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; -#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 5, d1, d2, d3, d4, d5) -#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 4, d1, d2, d3, d4, 0) -#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 3, d1, d2, d3, 0, 0) -#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 2, d1, d2, 0, 0, 0) -#define KVMTRACE_1D(evt, vcpu, d1, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 1, d1, 0, 0, 0, 0) -#define KVMTRACE_0D(evt, vcpu, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 0, 0, 0, 0, 0, 0) - -#ifdef CONFIG_KVM_TRACE -int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg); -void kvm_trace_cleanup(void); -#else -static inline -int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg) -{ - return -EINVAL; -} -#define kvm_trace_cleanup() ((void)0) -#endif - #ifdef KVM_ARCH_WANT_MMU_NOTIFIER static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f1e2e8c373c6..bbb4029d7c4d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2398,7 +2398,7 @@ static long kvm_dev_ioctl(struct file *filp, case KVM_TRACE_ENABLE: case KVM_TRACE_PAUSE: case KVM_TRACE_DISABLE: - r = kvm_trace_ioctl(ioctl, arg); + r = -EOPNOTSUPP; break; default: return kvm_arch_dev_ioctl(filp, ioctl, arg); @@ -2748,7 +2748,6 @@ EXPORT_SYMBOL_GPL(kvm_init); void kvm_exit(void) { - kvm_trace_cleanup(); tracepoint_synchronize_unregister(); misc_deregister(&kvm_dev); kmem_cache_destroy(kvm_vcpu_cache); diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c deleted file mode 100644 index f59874446440..000000000000 --- a/virt/kvm/kvm_trace.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * kvm trace - * - * It is designed to allow debugging traces of kvm to be generated - * on UP / SMP machines. Each trace entry can be timestamped so that - * it's possible to reconstruct a chronological record of trace events. - * The implementation refers to blktrace kernel support. - * - * Copyright (c) 2008 Intel Corporation - * Copyright (C) 2006 Jens Axboe - * - * Authors: Feng(Eric) Liu, eric.e.liu@intel.com - * - * Date: Feb 2008 - */ - -#include -#include -#include -#include - -#include - -#define KVM_TRACE_STATE_RUNNING (1 << 0) -#define KVM_TRACE_STATE_PAUSE (1 << 1) -#define KVM_TRACE_STATE_CLEARUP (1 << 2) - -struct kvm_trace { - int trace_state; - struct rchan *rchan; - struct dentry *lost_file; - atomic_t lost_records; -}; -static struct kvm_trace *kvm_trace; - -struct kvm_trace_probe { - const char *name; - const char *format; - u32 timestamp_in; - marker_probe_func *probe_func; -}; - -static inline int calc_rec_size(int timestamp, int extra) -{ - int rec_size = KVM_TRC_HEAD_SIZE; - - rec_size += extra; - return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size; -} - -static void kvm_add_trace(void *probe_private, void *call_data, - const char *format, va_list *args) -{ - struct kvm_trace_probe *p = probe_private; - struct kvm_trace *kt = kvm_trace; - struct kvm_trace_rec rec; - struct kvm_vcpu *vcpu; - int i, size; - u32 extra; - - if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING)) - return; - - rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32)); - vcpu = va_arg(*args, struct kvm_vcpu *); - rec.pid = current->tgid; - rec.vcpu_id = vcpu->vcpu_id; - - extra = va_arg(*args, u32); - WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX)); - extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX); - - rec.rec_val |= TRACE_REC_TCS(p->timestamp_in) - | TRACE_REC_NUM_DATA_ARGS(extra); - - if (p->timestamp_in) { - rec.u.timestamp.timestamp = ktime_to_ns(ktime_get()); - - for (i = 0; i < extra; i++) - rec.u.timestamp.extra_u32[i] = va_arg(*args, u32); - } else { - for (i = 0; i < extra; i++) - rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32); - } - - size = calc_rec_size(p->timestamp_in, extra * sizeof(u32)); - relay_write(kt->rchan, &rec, size); -} - -static struct kvm_trace_probe kvm_trace_probes[] = { - { "kvm_trace_entryexit", "%u %p %u %u %u %u %u %u", 1, kvm_add_trace }, - { "kvm_trace_handler", "%u %p %u %u %u %u %u %u", 0, kvm_add_trace }, -}; - -static int lost_records_get(void *data, u64 *val) -{ - struct kvm_trace *kt = data; - - *val = atomic_read(&kt->lost_records); - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(kvm_trace_lost_ops, lost_records_get, NULL, "%llu\n"); - -/* - * The relay channel is used in "no-overwrite" mode, it keeps trace of how - * many times we encountered a full subbuffer, to tell user space app the - * lost records there were. - */ -static int kvm_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct kvm_trace *kt; - - if (!relay_buf_full(buf)) { - if (!prev_subbuf) { - /* - * executed only once when the channel is opened - * save metadata as first record - */ - subbuf_start_reserve(buf, sizeof(u32)); - *(u32 *)subbuf = 0x12345678; - } - - return 1; - } - - kt = buf->chan->private_data; - atomic_inc(&kt->lost_records); - - return 0; -} - -static struct dentry *kvm_create_buf_file_callack(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static int kvm_remove_buf_file_callback(struct dentry *dentry) -{ - debugfs_remove(dentry); - return 0; -} - -static struct rchan_callbacks kvm_relay_callbacks = { - .subbuf_start = kvm_subbuf_start_callback, - .create_buf_file = kvm_create_buf_file_callack, - .remove_buf_file = kvm_remove_buf_file_callback, -}; - -static int do_kvm_trace_enable(struct kvm_user_trace_setup *kuts) -{ - struct kvm_trace *kt; - int i, r = -ENOMEM; - - if (!kuts->buf_size || !kuts->buf_nr) - return -EINVAL; - - kt = kzalloc(sizeof(*kt), GFP_KERNEL); - if (!kt) - goto err; - - r = -EIO; - atomic_set(&kt->lost_records, 0); - kt->lost_file = debugfs_create_file("lost_records", 0444, kvm_debugfs_dir, - kt, &kvm_trace_lost_ops); - if (!kt->lost_file) - goto err; - - kt->rchan = relay_open("trace", kvm_debugfs_dir, kuts->buf_size, - kuts->buf_nr, &kvm_relay_callbacks, kt); - if (!kt->rchan) - goto err; - - kvm_trace = kt; - - for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { - struct kvm_trace_probe *p = &kvm_trace_probes[i]; - - r = marker_probe_register(p->name, p->format, p->probe_func, p); - if (r) - printk(KERN_INFO "Unable to register probe %s\n", - p->name); - } - - kvm_trace->trace_state = KVM_TRACE_STATE_RUNNING; - - return 0; -err: - if (kt) { - if (kt->lost_file) - debugfs_remove(kt->lost_file); - if (kt->rchan) - relay_close(kt->rchan); - kfree(kt); - } - return r; -} - -static int kvm_trace_enable(char __user *arg) -{ - struct kvm_user_trace_setup kuts; - int ret; - - ret = copy_from_user(&kuts, arg, sizeof(kuts)); - if (ret) - return -EFAULT; - - ret = do_kvm_trace_enable(&kuts); - if (ret) - return ret; - - return 0; -} - -static int kvm_trace_pause(void) -{ - struct kvm_trace *kt = kvm_trace; - int r = -EINVAL; - - if (kt == NULL) - return r; - - if (kt->trace_state == KVM_TRACE_STATE_RUNNING) { - kt->trace_state = KVM_TRACE_STATE_PAUSE; - relay_flush(kt->rchan); - r = 0; - } - - return r; -} - -void kvm_trace_cleanup(void) -{ - struct kvm_trace *kt = kvm_trace; - int i; - - if (kt == NULL) - return; - - if (kt->trace_state == KVM_TRACE_STATE_RUNNING || - kt->trace_state == KVM_TRACE_STATE_PAUSE) { - - kt->trace_state = KVM_TRACE_STATE_CLEARUP; - - for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { - struct kvm_trace_probe *p = &kvm_trace_probes[i]; - marker_probe_unregister(p->name, p->probe_func, p); - } - marker_synchronize_unregister(); - - relay_close(kt->rchan); - debugfs_remove(kt->lost_file); - kfree(kt); - } -} - -int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - long r = -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (ioctl) { - case KVM_TRACE_ENABLE: - r = kvm_trace_enable(argp); - break; - case KVM_TRACE_PAUSE: - r = kvm_trace_pause(); - break; - case KVM_TRACE_DISABLE: - r = 0; - kvm_trace_cleanup(); - break; - } - - return r; -} -- cgit v1.2.3-70-g09d2 From d3efc8efdbaa81e9db3089810fcd526fccba234d Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 17 Jun 2009 10:07:59 -0300 Subject: KVM: use vcpu_id instead of bsp_vcpu pointer in kvm_vcpu_is_bsp Change kvm_vcpu_is_bsp to use vcpu_id instead of bsp_vcpu pointer, which is only initialized at the end of kvm_vm_ioctl_create_vcpu. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0604d56f6eed..4ea42c950539 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -538,7 +538,7 @@ static inline void kvm_irqfd_release(struct kvm *kvm) {} #ifdef CONFIG_KVM_APIC_ARCHITECTURE static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { - return vcpu->kvm->bsp_vcpu == vcpu; + return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; } #endif #endif -- cgit v1.2.3-70-g09d2 From 69fa2d78653918d200220ca62d44715827e6d617 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Jun 2009 22:24:07 +0300 Subject: KVM: document locking for kvm_io_device_ops slots_lock is taken everywhere when device ops are called. Document this as we will use this to rework locking for io. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- virt/kvm/iodev.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h index 2c67f5acd6db..06e38b23fa61 100644 --- a/virt/kvm/iodev.h +++ b/virt/kvm/iodev.h @@ -20,6 +20,9 @@ struct kvm_io_device; +/** + * kvm_io_device_ops are called under kvm slots_lock. + **/ struct kvm_io_device_ops { void (*read)(struct kvm_io_device *this, gpa_t addr, -- cgit v1.2.3-70-g09d2 From d5c2dcc36aaf4dfdf9e01513493c71d71715f817 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Jun 2009 22:24:14 +0300 Subject: KVM: switch coalesced mmio changes to slots_lock switch coalesced mmio slots_lock. slots_lock is already taken for read everywhere, so we only need to take it for write when changing zones. This is in preparation to removing in_range and kvm->lock around it. [avi: fix build] Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- virt/kvm/coalesced_mmio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 397f41936698..b40946c1237d 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -115,16 +115,16 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, if (dev == NULL) return -EINVAL; - mutex_lock(&kvm->lock); + down_write(&kvm->slots_lock); if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); return -ENOBUFS; } dev->zone[dev->nb_zones] = *zone; dev->nb_zones++; - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); return 0; } @@ -138,7 +138,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, if (dev == NULL) return -EINVAL; - mutex_lock(&kvm->lock); + down_write(&kvm->slots_lock); i = dev->nb_zones; while(i) { @@ -156,7 +156,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, i--; } - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); return 0; } -- cgit v1.2.3-70-g09d2 From 108b56690f35e083c5559116d6656f59a557a815 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Jun 2009 22:24:21 +0300 Subject: KVM: switch pit creation to slots_lock switch pit creation to slots_lock. slots_lock is already taken for read everywhere, so we only need to take it for write when creating pit. This is in preparation to removing in_range and kvm->lock around it. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0be75d53b7fd..7ce6367c1976 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2188,7 +2188,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - mutex_lock(&kvm->lock); + down_write(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -2197,7 +2197,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { -- cgit v1.2.3-70-g09d2 From 6c474694530f377507f9aca438c17206e051e6e7 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Jun 2009 22:24:26 +0300 Subject: KVM: convert bus to slots_lock Use slots_lock to protect device list on the bus. slots_lock is already taken for read everywhere, so we only need to take it for write when registering devices. This is in preparation to removing in_range and kvm->lock around it. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 5 +++-- arch/x86/kvm/i8259.c | 2 +- include/linux/kvm_host.h | 5 ++++- virt/kvm/coalesced_mmio.c | 2 +- virt/kvm/ioapic.c | 2 +- virt/kvm/kvm_main.c | 12 +++++++++++- 6 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index bcd00c76d69e..4082cdd468ed 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -583,6 +583,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .in_range = speaker_in_range, }; +/* Caller must have writers lock on slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -621,11 +622,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); + __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); } return pit; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 148c52a608d6..1851aec8a7da 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -548,6 +548,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); + kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); return s; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4ea42c950539..96c8c0b01929 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -42,6 +42,7 @@ #define KVM_USERSPACE_IRQ_SOURCE_ID 0 +struct kvm; struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; @@ -61,7 +62,9 @@ void kvm_io_bus_init(struct kvm_io_bus *bus); void kvm_io_bus_destroy(struct kvm_io_bus *bus); struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr, int len, int is_write); -void kvm_io_bus_register_dev(struct kvm_io_bus *bus, +void __kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev); +void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, struct kvm_io_device *dev); struct kvm_vcpu { diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index b40946c1237d..7b7cc9fe5ee3 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -102,7 +102,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm) kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; kvm->coalesced_mmio_dev = dev; - kvm_io_bus_register_dev(&kvm->mmio_bus, &dev->dev); + kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev); return 0; } diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 0532fa68f5d1..0eca54e06326 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -343,7 +343,7 @@ int kvm_ioapic_init(struct kvm *kvm) kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; - kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); + kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bbb4029d7c4d..0edc366ecf89 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2527,7 +2527,17 @@ struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, return NULL; } -void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) +void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, + struct kvm_io_device *dev) +{ + down_write(&kvm->slots_lock); + __kvm_io_bus_register_dev(bus, dev); + up_write(&kvm->slots_lock); +} + +/* An unlocked version. Caller must have write lock on slots_lock. */ +void __kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev) { BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); -- cgit v1.2.3-70-g09d2 From bda9020e2463ec94db9f97e8615f3bae22069838 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Jun 2009 22:24:32 +0300 Subject: KVM: remove in_range from io devices This changes bus accesses to use high-level kvm_io_bus_read/kvm_io_bus_write functions. in_range now becomes unused so it is removed from device ops in favor of read/write callbacks performing range checks internally. This allows aliasing (mostly for in-kernel virtio), as well as better error handling by making it possible to pass errors up to userspace. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 28 ++++-------- arch/x86/kvm/i8254.c | 49 +++++++++++---------- arch/x86/kvm/i8259.c | 20 +++++---- arch/x86/kvm/lapic.c | 44 +++++++++---------- arch/x86/kvm/x86.c | 110 ++++++++++++++-------------------------------- include/linux/kvm_host.h | 6 ++- virt/kvm/coalesced_mmio.c | 16 +++---- virt/kvm/ioapic.c | 22 +++++----- virt/kvm/iodev.h | 39 +++++++--------- virt/kvm/kvm_main.c | 26 ++++++----- 10 files changed, 152 insertions(+), 208 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 5c766bd82b05..d7aa6bb8f477 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -210,16 +210,6 @@ int kvm_dev_ioctl_check_extension(long ext) } -static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, int is_write) -{ - struct kvm_io_device *dev; - - dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, is_write); - - return dev; -} - static int handle_vm_error(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { kvm_run->exit_reason = KVM_EXIT_UNKNOWN; @@ -231,6 +221,7 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct kvm_mmio_req *p; struct kvm_io_device *mmio_dev; + int r; p = kvm_get_vcpu_ioreq(vcpu); @@ -247,16 +238,13 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->exit_reason = KVM_EXIT_MMIO; return 0; mmio: - mmio_dev = vcpu_find_mmio_dev(vcpu, p->addr, p->size, !p->dir); - if (mmio_dev) { - if (!p->dir) - kvm_iodevice_write(mmio_dev, p->addr, p->size, - &p->data); - else - kvm_iodevice_read(mmio_dev, p->addr, p->size, - &p->data); - - } else + if (p->dir) + r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr, + p->size, &p->data); + else + r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr, + p->size, &p->data); + if (r) printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr); p->state = STATE_IORESP_READY; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4082cdd468ed..8c3ac30ef9bd 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -358,8 +358,14 @@ static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) return container_of(dev, struct kvm_pit, speaker_dev); } -static void pit_ioport_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *data) +static inline int pit_in_range(gpa_t addr) +{ + return ((addr >= KVM_PIT_BASE_ADDRESS) && + (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); +} + +static int pit_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; @@ -367,6 +373,8 @@ static void pit_ioport_write(struct kvm_io_device *this, int channel, access; struct kvm_kpit_channel_state *s; u32 val = *(u32 *) data; + if (!pit_in_range(addr)) + return -EOPNOTSUPP; val &= 0xff; addr &= KVM_PIT_CHANNEL_MASK; @@ -429,16 +437,19 @@ static void pit_ioport_write(struct kvm_io_device *this, } mutex_unlock(&pit_state->lock); + return 0; } -static void pit_ioport_read(struct kvm_io_device *this, - gpa_t addr, int len, void *data) +static int pit_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; int ret, count; struct kvm_kpit_channel_state *s; + if (!pit_in_range(addr)) + return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; s = &pit_state->channels[addr]; @@ -493,37 +504,36 @@ static void pit_ioport_read(struct kvm_io_device *this, memcpy(data, (char *)&ret, len); mutex_unlock(&pit_state->lock); + return 0; } -static int pit_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) -{ - return ((addr >= KVM_PIT_BASE_ADDRESS) && - (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); -} - -static void speaker_ioport_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *data) +static int speaker_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; u32 val = *(u32 *) data; + if (addr != KVM_SPEAKER_BASE_ADDRESS) + return -EOPNOTSUPP; mutex_lock(&pit_state->lock); pit_state->speaker_data_on = (val >> 1) & 1; pit_set_gate(kvm, 2, val & 1); mutex_unlock(&pit_state->lock); + return 0; } -static void speaker_ioport_read(struct kvm_io_device *this, - gpa_t addr, int len, void *data) +static int speaker_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; unsigned int refresh_clock; int ret; + if (addr != KVM_SPEAKER_BASE_ADDRESS) + return -EOPNOTSUPP; /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; @@ -535,12 +545,7 @@ static void speaker_ioport_read(struct kvm_io_device *this, len = sizeof(ret); memcpy(data, (char *)&ret, len); mutex_unlock(&pit_state->lock); -} - -static int speaker_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) -{ - return (addr == KVM_SPEAKER_BASE_ADDRESS); + return 0; } void kvm_pit_reset(struct kvm_pit *pit) @@ -574,13 +579,11 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) static const struct kvm_io_device_ops pit_dev_ops = { .read = pit_ioport_read, .write = pit_ioport_write, - .in_range = pit_in_range, }; static const struct kvm_io_device_ops speaker_dev_ops = { .read = speaker_ioport_read, .write = speaker_ioport_write, - .in_range = speaker_in_range, }; /* Caller must have writers lock on slots_lock */ diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 1851aec8a7da..1d1bb75dc7bc 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -430,8 +430,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) +static int picdev_in_range(gpa_t addr) { switch (addr) { case 0x20: @@ -451,16 +450,18 @@ static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) return container_of(dev, struct kvm_pic, dev); } -static void picdev_write(struct kvm_io_device *this, +static int picdev_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val) { struct kvm_pic *s = to_pic(this); unsigned char data = *(unsigned char *)val; + if (!picdev_in_range(addr)) + return -EOPNOTSUPP; if (len != 1) { if (printk_ratelimit()) printk(KERN_ERR "PIC: non byte write\n"); - return; + return 0; } pic_lock(s); switch (addr) { @@ -476,18 +477,21 @@ static void picdev_write(struct kvm_io_device *this, break; } pic_unlock(s); + return 0; } -static void picdev_read(struct kvm_io_device *this, - gpa_t addr, int len, void *val) +static int picdev_read(struct kvm_io_device *this, + gpa_t addr, int len, void *val) { struct kvm_pic *s = to_pic(this); unsigned char data = 0; + if (!picdev_in_range(addr)) + return -EOPNOTSUPP; if (len != 1) { if (printk_ratelimit()) printk(KERN_ERR "PIC: non byte read\n"); - return; + return 0; } pic_lock(s); switch (addr) { @@ -504,6 +508,7 @@ static void picdev_read(struct kvm_io_device *this, } *(unsigned char *)val = data; pic_unlock(s); + return 0; } /* @@ -526,7 +531,6 @@ static void pic_irq_request(void *opaque, int level) static const struct kvm_io_device_ops picdev_ops = { .read = picdev_read, .write = picdev_write, - .in_range = picdev_in_range, }; struct kvm_pic *kvm_create_pic(struct kvm *kvm) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2e0286596387..265a765f038f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -546,18 +546,27 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) return container_of(dev, struct kvm_lapic, dev); } -static void apic_mmio_read(struct kvm_io_device *this, - gpa_t address, int len, void *data) +static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) +{ + return apic_hw_enabled(apic) && + addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; +} + +static int apic_mmio_read(struct kvm_io_device *this, + gpa_t address, int len, void *data) { struct kvm_lapic *apic = to_lapic(this); unsigned int offset = address - apic->base_address; unsigned char alignment = offset & 0xf; u32 result; + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; if ((alignment + len) > 4) { printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", (unsigned long)address, len); - return; + return 0; } result = __apic_read(apic, offset & ~0xf); @@ -574,6 +583,7 @@ static void apic_mmio_read(struct kvm_io_device *this, "should be 1,2, or 4 instead\n", len); break; } + return 0; } static void update_divide_count(struct kvm_lapic *apic) @@ -629,13 +639,15 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) apic->vcpu->kvm->arch.vapics_in_nmi_mode--; } -static void apic_mmio_write(struct kvm_io_device *this, - gpa_t address, int len, const void *data) +static int apic_mmio_write(struct kvm_io_device *this, + gpa_t address, int len, const void *data) { struct kvm_lapic *apic = to_lapic(this); unsigned int offset = address - apic->base_address; unsigned char alignment = offset & 0xf; u32 val; + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; /* * APIC register must be aligned on 128-bits boundary. @@ -646,7 +658,7 @@ static void apic_mmio_write(struct kvm_io_device *this, /* Don't shout loud, $infamous_os would cause only noise. */ apic_debug("apic write: bad size=%d %lx\n", len, (long)address); - return; + return 0; } val = *(u32 *) data; @@ -729,7 +741,7 @@ static void apic_mmio_write(struct kvm_io_device *this, hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); - return; + return 0; case APIC_TDCR: if (val & 4) @@ -743,22 +755,7 @@ static void apic_mmio_write(struct kvm_io_device *this, offset); break; } - -} - -static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, - int len, int size) -{ - struct kvm_lapic *apic = to_lapic(this); - int ret = 0; - - - if (apic_hw_enabled(apic) && - (addr >= apic->base_address) && - (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) - ret = 1; - - return ret; + return 0; } void kvm_free_lapic(struct kvm_vcpu *vcpu) @@ -938,7 +935,6 @@ static struct kvm_timer_ops lapic_timer_ops = { static const struct kvm_io_device_ops apic_mmio_ops = { .read = apic_mmio_read, .write = apic_mmio_write, - .in_range = apic_mmio_range, }; int kvm_create_lapic(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ce6367c1976..96f0ae7d97b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2333,35 +2333,23 @@ static void kvm_init_msr_list(void) num_msrs_to_save = j; } -/* - * Only apic need an MMIO device hook, so shortcut now.. - */ -static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, - int is_write) +static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, + const void *v) { - struct kvm_io_device *dev; + if (vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) + return 0; - if (vcpu->arch.apic) { - dev = &vcpu->arch.apic->dev; - if (kvm_iodevice_in_range(dev, addr, len, is_write)) - return dev; - } - return NULL; + return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); } - -static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, - int is_write) +static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { - struct kvm_io_device *dev; + if (vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) + return 0; - dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); - if (dev == NULL) - dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, - is_write); - return dev; + return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); } static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, @@ -2430,7 +2418,6 @@ static int emulator_read_emulated(unsigned long addr, unsigned int bytes, struct kvm_vcpu *vcpu) { - struct kvm_io_device *mmio_dev; gpa_t gpa; if (vcpu->mmio_read_completed) { @@ -2455,13 +2442,8 @@ mmio: /* * Is this MMIO handled locally? */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); - mutex_unlock(&vcpu->kvm->lock); - if (mmio_dev) { - kvm_iodevice_read(mmio_dev, gpa, bytes, val); + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) return X86EMUL_CONTINUE; - } vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -2488,7 +2470,6 @@ static int emulator_write_emulated_onepage(unsigned long addr, unsigned int bytes, struct kvm_vcpu *vcpu) { - struct kvm_io_device *mmio_dev; gpa_t gpa; gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); @@ -2509,13 +2490,8 @@ mmio: /* * Is this MMIO handled locally? */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); - mutex_unlock(&vcpu->kvm->lock); - if (mmio_dev) { - kvm_iodevice_write(mmio_dev, gpa, bytes, val); + if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) return X86EMUL_CONTINUE; - } vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -2850,48 +2826,40 @@ int complete_pio(struct kvm_vcpu *vcpu) return 0; } -static void kernel_pio(struct kvm_io_device *pio_dev, - struct kvm_vcpu *vcpu, - void *pd) +static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) { /* TODO: String I/O for in kernel device */ + int r; if (vcpu->arch.pio.in) - kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, - pd); + r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); else - kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, - pd); + r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + return r; } -static void pio_string_write(struct kvm_io_device *pio_dev, - struct kvm_vcpu *vcpu) +static int pio_string_write(struct kvm_vcpu *vcpu) { struct kvm_pio_request *io = &vcpu->arch.pio; void *pd = vcpu->arch.pio_data; - int i; + int i, r = 0; for (i = 0; i < io->cur_count; i++) { - kvm_iodevice_write(pio_dev, io->port, - io->size, - pd); + if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + io->port, io->size, pd)) { + r = -EOPNOTSUPP; + break; + } pd += io->size; } -} - -static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, - int is_write) -{ - return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); + return r; } int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { - struct kvm_io_device *pio_dev; unsigned long val; vcpu->run->exit_reason = KVM_EXIT_IO; @@ -2911,11 +2879,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); - mutex_lock(&vcpu->kvm->lock); - pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); - mutex_unlock(&vcpu->kvm->lock); - if (pio_dev) { - kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { complete_pio(vcpu); return 1; } @@ -2929,7 +2893,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, { unsigned now, in_page; int ret = 0; - struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -2973,12 +2936,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.guest_gva = address; - mutex_lock(&vcpu->kvm->lock); - pio_dev = vcpu_find_pio_dev(vcpu, port, - vcpu->arch.pio.cur_count, - !vcpu->arch.pio.in); - mutex_unlock(&vcpu->kvm->lock); - if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); @@ -2986,16 +2943,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, kvm_inject_gp(vcpu, 0); return 1; } - if (ret == 0 && pio_dev) { - pio_string_write(pio_dev, vcpu); + if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) ret = 1; } - } else if (pio_dev) - pr_unimpl(vcpu, "no string pio read support yet, " - "port %x size %d count %ld\n", - port, size, count); + } + /* no string PIO read support yet */ return ret; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 96c8c0b01929..077e8bb875a9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -60,8 +60,10 @@ struct kvm_io_bus { void kvm_io_bus_init(struct kvm_io_bus *bus); void kvm_io_bus_destroy(struct kvm_io_bus *bus); -struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, - gpa_t addr, int len, int is_write); +int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, int len, + const void *val); +int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, + void *val); void __kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev); void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 7b7cc9fe5ee3..0352f81ecc0b 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -19,18 +19,14 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) return container_of(dev, struct kvm_coalesced_mmio_dev, dev); } -static int coalesced_mmio_in_range(struct kvm_io_device *this, - gpa_t addr, int len, int is_write) +static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, + gpa_t addr, int len) { - struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_zone *zone; struct kvm_coalesced_mmio_ring *ring; unsigned avail; int i; - if (!is_write) - return 0; - /* Are we able to batch it ? */ /* last is the first free entry @@ -60,11 +56,13 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this, return 0; } -static void coalesced_mmio_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *val) +static int coalesced_mmio_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *val) { struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; + if (!coalesced_mmio_in_range(dev, addr, len)) + return -EOPNOTSUPP; spin_lock(&dev->lock); @@ -76,6 +74,7 @@ static void coalesced_mmio_write(struct kvm_io_device *this, smp_wmb(); ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; spin_unlock(&dev->lock); + return 0; } static void coalesced_mmio_destructor(struct kvm_io_device *this) @@ -87,7 +86,6 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this) static const struct kvm_io_device_ops coalesced_mmio_ops = { .write = coalesced_mmio_write, - .in_range = coalesced_mmio_in_range, .destructor = coalesced_mmio_destructor, }; diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 0eca54e06326..ddf6aa998b18 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -227,20 +227,19 @@ static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) return container_of(dev, struct kvm_ioapic, dev); } -static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) +static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) { - struct kvm_ioapic *ioapic = to_ioapic(this); - return ((addr >= ioapic->base_address && (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); } -static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, - void *val) +static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) { struct kvm_ioapic *ioapic = to_ioapic(this); u32 result; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; ioapic_debug("addr %lx\n", (unsigned long)addr); ASSERT(!(addr & 0xf)); /* check alignment */ @@ -273,13 +272,16 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, printk(KERN_WARNING "ioapic: wrong length %d\n", len); } mutex_unlock(&ioapic->kvm->irq_lock); + return 0; } -static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, - const void *val) +static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) { struct kvm_ioapic *ioapic = to_ioapic(this); u32 data; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", (void*)addr, len, val); @@ -290,7 +292,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, data = *(u32 *) val; else { printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); - return; + return 0; } addr &= 0xff; @@ -312,6 +314,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, break; } mutex_unlock(&ioapic->kvm->irq_lock); + return 0; } void kvm_ioapic_reset(struct kvm_ioapic *ioapic) @@ -329,7 +332,6 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic) static const struct kvm_io_device_ops ioapic_mmio_ops = { .read = ioapic_mmio_read, .write = ioapic_mmio_write, - .in_range = ioapic_in_range, }; int kvm_ioapic_init(struct kvm *kvm) diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h index 06e38b23fa61..12fd3caffd2b 100644 --- a/virt/kvm/iodev.h +++ b/virt/kvm/iodev.h @@ -17,23 +17,24 @@ #define __KVM_IODEV_H__ #include +#include struct kvm_io_device; /** * kvm_io_device_ops are called under kvm slots_lock. + * read and write handlers return 0 if the transaction has been handled, + * or non-zero to have it passed to the next device. **/ struct kvm_io_device_ops { - void (*read)(struct kvm_io_device *this, + int (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + int (*write)(struct kvm_io_device *this, gpa_t addr, int len, - void *val); - void (*write)(struct kvm_io_device *this, - gpa_t addr, - int len, - const void *val); - int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len, - int is_write); + const void *val); void (*destructor)(struct kvm_io_device *this); }; @@ -48,26 +49,16 @@ static inline void kvm_iodevice_init(struct kvm_io_device *dev, dev->ops = ops; } -static inline void kvm_iodevice_read(struct kvm_io_device *dev, - gpa_t addr, - int len, - void *val) +static inline int kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, int l, void *v) { - dev->ops->read(dev, addr, len, val); + return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP; } -static inline void kvm_iodevice_write(struct kvm_io_device *dev, - gpa_t addr, - int len, - const void *val) +static inline int kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, int l, const void *v) { - dev->ops->write(dev, addr, len, val); -} - -static inline int kvm_iodevice_in_range(struct kvm_io_device *dev, - gpa_t addr, int len, int is_write) -{ - return dev->ops->in_range(dev, addr, len, is_write); + return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP; } static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0edc366ecf89..594606526620 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2512,19 +2512,25 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus) } } -struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, - gpa_t addr, int len, int is_write) +/* kvm_io_bus_write - called under kvm->slots_lock */ +int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, + int len, const void *val) { int i; + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; +} - for (i = 0; i < bus->dev_count; i++) { - struct kvm_io_device *pos = bus->devs[i]; - - if (kvm_iodevice_in_range(pos, addr, len, is_write)) - return pos; - } - - return NULL; +/* kvm_io_bus_read - called under kvm->slots_lock */ +int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val) +{ + int i; + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; } void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, -- cgit v1.2.3-70-g09d2 From 22fc02943a630417cb71734cd0b2e6f8dc4210b6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Jun 2009 22:24:45 +0300 Subject: KVM: document lock nesting rule Document kvm->lock nesting within kvm->slots_lock Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 594606526620..fc1b58a72757 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -68,7 +68,7 @@ MODULE_LICENSE("GPL"); /* * Ordering of locks: * - * kvm->lock --> kvm->irq_lock + * kvm->slots_lock --> kvm->lock --> kvm->irq_lock */ DEFINE_SPINLOCK(kvm_lock); -- cgit v1.2.3-70-g09d2 From 27c4ba60171d9ce29d3645619837760d758ed288 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 29 Jun 2009 18:05:10 +0200 Subject: KVM: fix lock imbalance There is a missing unlock on one fail path in ioapic_mmio_write, fix that. Signed-off-by: Jiri Slaby Signed-off-by: Avi Kivity --- virt/kvm/ioapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index ddf6aa998b18..8a9c6cc382f8 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -292,7 +292,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, data = *(u32 *) val; else { printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); - return 0; + goto unlock; } addr &= 0xff; @@ -313,6 +313,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, default: break; } +unlock: mutex_unlock(&ioapic->kvm->irq_lock); return 0; } -- cgit v1.2.3-70-g09d2 From ae8c1c4025c2b780616586c3f86a3374a154ef90 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Jul 2009 12:09:41 +0300 Subject: KVM: Trace irq level and source id Signed-off-by: Avi Kivity --- include/trace/events/kvm.h | 11 ++++++++--- virt/kvm/irq_comm.c | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index d74b23d803f1..035232dc84e0 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -9,18 +9,23 @@ #if defined(__KVM_HAVE_IOAPIC) TRACE_EVENT(kvm_set_irq, - TP_PROTO(unsigned int gsi), - TP_ARGS(gsi), + TP_PROTO(unsigned int gsi, int level, int irq_source_id), + TP_ARGS(gsi, level, irq_source_id), TP_STRUCT__entry( __field( unsigned int, gsi ) + __field( int, level ) + __field( int, irq_source_id ) ), TP_fast_assign( __entry->gsi = gsi; + __entry->level = level; + __entry->irq_source_id = irq_source_id; ), - TP_printk("gsi %u", __entry->gsi) + TP_printk("gsi %u level %d source %d", + __entry->gsi, __entry->level, __entry->irq_source_id) ); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 94759ed96b64..56e696104d43 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -126,7 +126,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) unsigned long *irq_state, sig_level; int ret = -1; - trace_kvm_set_irq(irq); + trace_kvm_set_irq(irq, level, irq_source_id); WARN_ON(!mutex_is_locked(&kvm->irq_lock)); -- cgit v1.2.3-70-g09d2 From c323c0e5f06150cc000e09e8e3b22356099e7bde Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 24 Jun 2009 15:37:05 +0200 Subject: KVM: Ignore PCI ECS I/O enablement Linux guests will try to enable access to the extended PCI config space via the I/O ports 0xCF8/0xCFC on AMD Fam10h CPU. Since we (currently?) don't use ECS, simply ignore write and read attempts. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 96f0ae7d97b6..af40e23df251 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -844,6 +844,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; } break; + case MSR_AMD64_NB_CFG: + break; case MSR_IA32_DEBUGCTLMSR: if (!data) { /* We support the non-activated case already */ @@ -1049,6 +1051,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: data = 0; break; case MSR_MTRRcap: -- cgit v1.2.3-70-g09d2 From aec51dc4f1584018d7e35269e04e3dde3d2033e6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Jul 2009 16:01:02 +0300 Subject: KVM: Trace mmio Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 ++++++++++- include/trace/events/kvm.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index af40e23df251..d32e3c6d3175 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -37,6 +37,8 @@ #include #include #include +#include +#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "trace.h" @@ -2425,6 +2427,8 @@ static int emulator_read_emulated(unsigned long addr, if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, + vcpu->mmio_phys_addr, *(u64 *)val); vcpu->mmio_read_completed = 0; return X86EMUL_CONTINUE; } @@ -2445,8 +2449,12 @@ mmio: /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); return X86EMUL_CONTINUE; + } + + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -2490,6 +2498,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, return X86EMUL_CONTINUE; mmio: + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); /* * Is this MMIO handled locally? */ diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 035232dc84e0..77022af48492 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -56,6 +56,39 @@ TRACE_EVENT(kvm_ack_irq, #endif /* defined(__KVM_HAVE_IOAPIC) */ + +#define KVM_TRACE_MMIO_READ_UNSATISFIED 0 +#define KVM_TRACE_MMIO_READ 1 +#define KVM_TRACE_MMIO_WRITE 2 + +#define kvm_trace_symbol_mmio \ + { KVM_TRACE_MMIO_READ_UNSATISFIED, "unsatisfied-read" }, \ + { KVM_TRACE_MMIO_READ, "read" }, \ + { KVM_TRACE_MMIO_WRITE, "write" } + +TRACE_EVENT(kvm_mmio, + TP_PROTO(int type, int len, u64 gpa, u64 val), + TP_ARGS(type, len, gpa, val), + + TP_STRUCT__entry( + __field( u32, type ) + __field( u32, len ) + __field( u64, gpa ) + __field( u64, val ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->len = len; + __entry->gpa = gpa; + __entry->val = val; + ), + + TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", + __print_symbolic(__entry->type, kvm_trace_symbol_mmio), + __entry->len, __entry->gpa, __entry->val) +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ -- cgit v1.2.3-70-g09d2 From cb24772140e09cb2503af7a4736ae3e08e9ac7d3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Jul 2009 18:25:44 +0300 Subject: KVM: Trace apic registers using their symbolic names Signed-off-by: Avi Kivity --- arch/x86/kvm/trace.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index cd8c90db41a5..6c2c87fa6e4f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -111,6 +111,15 @@ TRACE_EVENT(kvm_cpuid, __entry->rbx, __entry->rcx, __entry->rdx) ); +#define AREG(x) { APIC_##x, "APIC_" #x } + +#define kvm_trace_symbol_apic \ + AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ + AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ + AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ + AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ + AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ + AREG(ECTRL) /* * Tracepoint for apic access. */ @@ -130,9 +139,10 @@ TRACE_EVENT(kvm_apic, __entry->val = val; ), - TP_printk("apic_%s 0x%x = 0x%x", + TP_printk("apic_%s %s = 0x%x", __entry->rw ? "write" : "read", - __entry->reg, __entry->val) + __print_symbolic(__entry->reg, kvm_trace_symbol_apic), + __entry->val) ); #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) -- cgit v1.2.3-70-g09d2 From fc61b800f9506ca47bf1439342a79847f2353562 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Jul 2009 17:39:35 +0300 Subject: KVM: Add Directed EOI support to APIC emulation Directed EOI is specified by x2APIC, but is available even when lapic is in xAPIC mode. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/apicdef.h | 2 ++ arch/x86/kvm/lapic.c | 38 ++++++++++++++++++++++++++++++-------- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 4 ++-- arch/x86/kvm/x86.h | 4 ++++ 5 files changed, 39 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7ddb36ab933b..74ca38f6a4de 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -14,6 +14,7 @@ #define APIC_LVR 0x30 #define APIC_LVR_MASK 0xFF00FF +#define APIC_LVR_DIRECTED_EOI (1 << 24) #define GET_APIC_VERSION(x) ((x) & 0xFFu) #define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) #ifdef CONFIG_X86_32 @@ -40,6 +41,7 @@ #define APIC_DFR_CLUSTER 0x0FFFFFFFul #define APIC_DFR_FLAT 0xFFFFFFFFul #define APIC_SPIV 0xF0 +#define APIC_SPIV_DIRECTED_EOI (1 << 12) #define APIC_SPIV_FOCUS_DISABLED (1 << 9) #define APIC_SPIV_APIC_ENABLED (1 << 8) #define APIC_ISR 0x100 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 265a765f038f..62ea2abfd961 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -35,6 +35,7 @@ #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" +#include "x86.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -142,6 +143,21 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +void kvm_apic_set_version(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_cpuid_entry2 *feat; + u32 v = APIC_VERSION; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); + if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) + v |= APIC_LVR_DIRECTED_EOI; + apic_set_reg(apic, APIC_LVR, v); +} + static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -442,9 +458,11 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; - mutex_lock(&apic->vcpu->kvm->irq_lock); - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); - mutex_unlock(&apic->vcpu->kvm->irq_lock); + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { + mutex_lock(&apic->vcpu->kvm->irq_lock); + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + mutex_unlock(&apic->vcpu->kvm->irq_lock); + } } static void apic_send_ipi(struct kvm_lapic *apic) @@ -694,8 +712,11 @@ static int apic_mmio_write(struct kvm_io_device *this, apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); break; - case APIC_SPIV: - apic_set_reg(apic, APIC_SPIV, val & 0x3ff); + case APIC_SPIV: { + u32 mask = 0x3ff; + if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + mask |= APIC_SPIV_DIRECTED_EOI; + apic_set_reg(apic, APIC_SPIV, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; u32 lvt_val; @@ -710,7 +731,7 @@ static int apic_mmio_write(struct kvm_io_device *this, } break; - + } case APIC_ICR: /* No delay here, so we always clear the pending bit */ apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); @@ -837,7 +858,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); - apic_set_reg(apic, APIC_LVR, APIC_VERSION); + kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); @@ -1041,7 +1062,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) apic->base_address = vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; - apic_set_reg(apic, APIC_LVR, APIC_VERSION); + kvm_apic_set_version(vcpu); + apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 3f3ecc6edbf5..bc1c5243c865 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -29,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); +void kvm_apic_set_version(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d32e3c6d3175..086f93137e3c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -79,8 +79,6 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -1373,6 +1371,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); r = 0; + kvm_apic_set_version(vcpu); out_free: vfree(cpuid_entries); @@ -1394,6 +1393,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); return 0; out: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4c8e10af78e8..5eadea585d2a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr) { return (nr == BP_VECTOR) || (nr == OF_VECTOR); } + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); + #endif -- cgit v1.2.3-70-g09d2 From 0105d1a526404cfc9779552a6198bfd0e5fc937a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Jul 2009 17:39:36 +0300 Subject: KVM: x2apic interface to lapic This patch implements MSR interface to local apic as defines by x2apic Intel specification. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 223 +++++++++++++++++++++++++++++++++++++-------------- arch/x86/kvm/lapic.h | 2 + arch/x86/kvm/x86.c | 7 +- 3 files changed, 173 insertions(+), 59 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 62ea2abfd961..683345a121b7 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" @@ -158,6 +159,11 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_LVR, v); } +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -284,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) { int result = 0; - u8 logical_id; + u32 logical_id; + + if (apic_x2apic_mode(apic)) { + logical_id = apic_get_reg(apic, APIC_LDR); + return logical_id & mda; + } logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); @@ -477,7 +488,10 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.level = icr_low & APIC_INT_ASSERT; irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; irq.shorthand = icr_low & APIC_SHORT_MASK; - irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + if (apic_x2apic_mode(apic)) + irq.dest_id = icr_high; + else + irq.dest_id = GET_APIC_DEST_FIELD(icr_high); apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " @@ -538,6 +552,12 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) return 0; switch (offset) { + case APIC_ID: + if (apic_x2apic_mode(apic)) + val = kvm_apic_id(apic); + else + val = kvm_apic_id(apic) << 24; + break; case APIC_ARBPRI: printk(KERN_WARNING "Access APIC ARBPRI register " "which is for P6\n"); @@ -564,28 +584,26 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) return container_of(dev, struct kvm_lapic, dev); } -static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) -{ - return apic_hw_enabled(apic) && - addr >= apic->base_address && - addr < apic->base_address + LAPIC_MMIO_LENGTH; -} - -static int apic_mmio_read(struct kvm_io_device *this, - gpa_t address, int len, void *data) +static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data) { - struct kvm_lapic *apic = to_lapic(this); - unsigned int offset = address - apic->base_address; unsigned char alignment = offset & 0xf; u32 result; - if (!apic_mmio_in_range(apic, address)) - return -EOPNOTSUPP; + /* this bitmask has a bit cleared for each reserver register */ + static const u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { - printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", - (unsigned long)address, len); - return 0; + printk(KERN_ERR "KVM_APIC_READ: alignment error %x %d\n", + offset, len); + return 1; } + + if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { + printk(KERN_ERR "KVM_APIC_READ: read reserved register %x\n", + offset); + return 1; + } + result = __apic_read(apic, offset & ~0xf); trace_kvm_apic_read(offset, result); @@ -604,6 +622,27 @@ static int apic_mmio_read(struct kvm_io_device *this, return 0; } +static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) +{ + return apic_hw_enabled(apic) && + addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; +} + +static int apic_mmio_read(struct kvm_io_device *this, + gpa_t address, int len, void *data) +{ + struct kvm_lapic *apic = to_lapic(this); + u32 offset = address - apic->base_address; + + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; + + apic_reg_read(apic, offset, len, data); + + return 0; +} + static void update_divide_count(struct kvm_lapic *apic) { u32 tmp1, tmp2, tdcr; @@ -657,42 +696,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) apic->vcpu->kvm->arch.vapics_in_nmi_mode--; } -static int apic_mmio_write(struct kvm_io_device *this, - gpa_t address, int len, const void *data) +static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { - struct kvm_lapic *apic = to_lapic(this); - unsigned int offset = address - apic->base_address; - unsigned char alignment = offset & 0xf; - u32 val; - if (!apic_mmio_in_range(apic, address)) - return -EOPNOTSUPP; - - /* - * APIC register must be aligned on 128-bits boundary. - * 32/64/128 bits registers must be accessed thru 32 bits. - * Refer SDM 8.4.1 - */ - if (len != 4 || alignment) { - /* Don't shout loud, $infamous_os would cause only noise. */ - apic_debug("apic write: bad size=%d %lx\n", - len, (long)address); - return 0; - } - - val = *(u32 *) data; + int ret = 0; - /* too common printing */ - if (offset != APIC_EOI) - apic_debug("%s: offset 0x%x with length 0x%x, and value is " - "0x%x\n", __func__, offset, len, val); + trace_kvm_apic_write(reg, val); - offset &= 0xff0; - - trace_kvm_apic_write(offset, val); - - switch (offset) { + switch (reg) { case APIC_ID: /* Local APIC ID */ - apic_set_reg(apic, APIC_ID, val); + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_ID, val); + else + ret = 1; break; case APIC_TASKPRI: @@ -705,11 +720,17 @@ static int apic_mmio_write(struct kvm_io_device *this, break; case APIC_LDR: - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + else + ret = 1; break; case APIC_DFR: - apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + else + ret = 1; break; case APIC_SPIV: { @@ -739,7 +760,9 @@ static int apic_mmio_write(struct kvm_io_device *this, break; case APIC_ICR2: - apic_set_reg(apic, APIC_ICR2, val & 0xff000000); + if (!apic_x2apic_mode(apic)) + val &= 0xff000000; + apic_set_reg(apic, APIC_ICR2, val); break; case APIC_LVT0: @@ -753,8 +776,8 @@ static int apic_mmio_write(struct kvm_io_device *this, if (!apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; - apic_set_reg(apic, offset, val); + val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; + apic_set_reg(apic, reg, val); break; @@ -762,7 +785,7 @@ static int apic_mmio_write(struct kvm_io_device *this, hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); - return 0; + break; case APIC_TDCR: if (val & 4) @@ -771,11 +794,58 @@ static int apic_mmio_write(struct kvm_io_device *this, update_divide_count(apic); break; + case APIC_ESR: + if (apic_x2apic_mode(apic) && val != 0) { + printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); + ret = 1; + } + break; + + case APIC_SELF_IPI: + if (apic_x2apic_mode(apic)) { + apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); + } else + ret = 1; + break; default: - apic_debug("Local APIC Write to read-only register %x\n", - offset); + ret = 1; break; } + if (ret) + apic_debug("Local APIC Write to read-only register %x\n", reg); + return ret; +} + +static int apic_mmio_write(struct kvm_io_device *this, + gpa_t address, int len, const void *data) +{ + struct kvm_lapic *apic = to_lapic(this); + unsigned int offset = address - apic->base_address; + u32 val; + + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; + + /* + * APIC register must be aligned on 128-bits boundary. + * 32/64/128 bits registers must be accessed thru 32 bits. + * Refer SDM 8.4.1 + */ + if (len != 4 || (offset & 0xf)) { + /* Don't shout loud, $infamous_os would cause only noise. */ + apic_debug("apic write: bad size=%d %lx\n", len, (long)address); + return; + } + + val = *(u32*)data; + + /* too common printing */ + if (offset != APIC_EOI) + apic_debug("%s: offset 0x%x with length 0x%x, and value is " + "0x%x\n", __func__, offset, len, val); + + apic_reg_write(apic, offset & 0xff0, val); + return 0; } @@ -834,6 +904,11 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) value &= ~MSR_IA32_APICBASE_BSP; vcpu->arch.apic_base = value; + if (apic_x2apic_mode(apic)) { + u32 id = kvm_apic_id(apic); + u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); + apic_set_reg(apic, APIC_LDR, ldr); + } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -1130,3 +1205,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) vcpu->arch.apic->vapic_addr = vapic_addr; } + +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4; + + if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + return 1; + + /* if this is ICR write vector before command */ + if (msr == 0x830) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (msr == 0x830) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index bc1c5243c865..40010b09c4aa 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -46,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 086f93137e3c..a50c83232e76 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -867,6 +867,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_APICBASE: kvm_set_apic_base(vcpu, data); break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -1065,6 +1067,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_read(vcpu, msr, pdata); + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -1469,7 +1474,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved, XSAVE, OSXSAVE */; /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = -- cgit v1.2.3-70-g09d2 From 70f93dae32aca8b3b7da66891aecd3ee0ee48060 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Jul 2009 18:48:12 +0300 Subject: KVM: Use temporary variable to shorten lines. Cosmetic only. No logic is changed by this patch. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- virt/kvm/ioapic.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 8a9c6cc382f8..92496ff3d82d 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -103,6 +103,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; + union kvm_ioapic_redirect_entry *e; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -122,19 +123,20 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) ioapic_debug("change redir index %x val %x\n", index, val); if (index >= IOAPIC_NUM_PINS) return; - mask_before = ioapic->redirtbl[index].fields.mask; + e = &ioapic->redirtbl[index]; + mask_before = e->fields.mask; if (ioapic->ioregsel & 1) { - ioapic->redirtbl[index].bits &= 0xffffffff; - ioapic->redirtbl[index].bits |= (u64) val << 32; + e->bits &= 0xffffffff; + e->bits |= (u64) val << 32; } else { - ioapic->redirtbl[index].bits &= ~0xffffffffULL; - ioapic->redirtbl[index].bits |= (u32) val; - ioapic->redirtbl[index].fields.remote_irr = 0; + e->bits &= ~0xffffffffULL; + e->bits |= (u32) val; + e->fields.remote_irr = 0; } - mask_after = ioapic->redirtbl[index].fields.mask; + mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); - if (ioapic->redirtbl[index].fields.trig_mode == IOAPIC_LEVEL_TRIG + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index); break; -- cgit v1.2.3-70-g09d2 From 756975bbfd185169aac19f227d65a8d738a5f5f8 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 6 Jul 2009 11:05:39 +0800 Subject: KVM: Fix apic_mmio_write return for unaligned write Some in-famous OS do unaligned writing for APIC MMIO, and the return value has been missed in recent change, then the OS hangs. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 683345a121b7..6c8460308548 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -834,7 +834,7 @@ static int apic_mmio_write(struct kvm_io_device *this, if (len != 4 || (offset & 0xf)) { /* Don't shout loud, $infamous_os would cause only noise. */ apic_debug("apic write: bad size=%d %lx\n", len, (long)address); - return; + return 0; } val = *(u32*)data; -- cgit v1.2.3-70-g09d2 From 6098ca939ee5ceb81d6628b9130112516bae7400 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 3 Jul 2009 16:00:14 +0200 Subject: KVM: handle AMD microcode MSR Windows 7 tries to update the CPU's microcode on some processors, so we ignore the MSR write here. The patchlevel register is already handled (returning 0), because the MSR number is the same as Intel's. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a50c83232e76..6dde99ca743e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -861,6 +861,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); -- cgit v1.2.3-70-g09d2 From dc7e795e3dd2a763e5ceaa1615f307e808cf3932 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 1 Jul 2009 20:52:03 +0200 Subject: Revert "KVM: x86: check for cr3 validity in ioctl_set_sregs" This reverts commit 6c20e1442bb1c62914bb85b7f4a38973d2a423ba. To my understanding, it became obsolete with the advent of the more robust check in mmu_alloc_roots (89da4ff17f). Moreover, it prevents the conceptually safe pattern 1. set sregs 2. register mem-slots 3. run vcpu by setting a sticky triple fault during step 1. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6dde99ca743e..0e74d98641a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4326,13 +4326,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - - down_read(&vcpu->kvm->slots_lock); - if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) - vcpu->arch.cr3 = sregs->cr3; - else - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - up_read(&vcpu->kvm->slots_lock); + vcpu->arch.cr3 = sregs->cr3; kvm_set_cr8(vcpu, sregs->cr8); -- cgit v1.2.3-70-g09d2 From 07420171593908406c3a59d6f884d426a921a5ea Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 6 Jul 2009 12:21:32 +0300 Subject: KVM: MMU: Trace guest pagetable walker Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++ arch/x86/kvm/mmutrace.h | 117 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 11 +++-- 3 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 arch/x86/kvm/mmutrace.h diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b67585c1ef08..c0dda6447b9f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -140,6 +140,9 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h new file mode 100644 index 000000000000..1367f82717d1 --- /dev/null +++ b/arch/x86/kvm/mmutrace.h @@ -0,0 +1,117 @@ +#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVMMMU_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvmmmu +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE mmutrace + +#define kvm_mmu_trace_pferr_flags \ + { PFERR_PRESENT_MASK, "P" }, \ + { PFERR_WRITE_MASK, "W" }, \ + { PFERR_USER_MASK, "U" }, \ + { PFERR_RSVD_MASK, "RSVD" }, \ + { PFERR_FETCH_MASK, "F" } + +/* + * A pagetable walk has started + */ +TRACE_EVENT( + kvm_mmu_pagetable_walk, + TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), + TP_ARGS(addr, write_fault, user_fault, fetch_fault), + + TP_STRUCT__entry( + __field(__u64, addr) + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) + | (!!fetch_fault << 4); + ), + + TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + + +/* We just walked a paging element */ +TRACE_EVENT( + kvm_mmu_paging_element, + TP_PROTO(u64 pte, int level), + TP_ARGS(pte, level), + + TP_STRUCT__entry( + __field(__u64, pte) + __field(__u32, level) + ), + + TP_fast_assign( + __entry->pte = pte; + __entry->level = level; + ), + + TP_printk("pte %llx level %u", __entry->pte, __entry->level) +); + +/* We set a pte accessed bit */ +TRACE_EVENT( + kvm_mmu_set_accessed_bit, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_ARGS(table_gfn, index, size), + + TP_STRUCT__entry( + __field(__u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) + + index * size; + ), + + TP_printk("gpa %llx", __entry->gpa) +); + +/* We set a pte dirty bit */ +TRACE_EVENT( + kvm_mmu_set_dirty_bit, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_ARGS(table_gfn, index, size), + + TP_STRUCT__entry( + __field(__u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) + + index * size; + ), + + TP_printk("gpa %llx", __entry->gpa) +); + +TRACE_EVENT( + kvm_mmu_walker_error, + TP_PROTO(u32 pferr), + TP_ARGS(pferr), + + TP_STRUCT__entry( + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->pferr = pferr; + ), + + TP_printk("pferr %x %s", __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + +#endif /* _TRACE_KVMMMU_H */ + +/* This part must be outside protection */ +#include diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 53e129cec5fd..36ac6d70a847 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -125,13 +125,15 @@ static int FNAME(walk_addr)(struct guest_walker *walker, gpa_t pte_gpa; int rsvd_fault = 0; - pgprintk("%s: addr %lx\n", __func__, addr); + trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, + fetch_fault); walk: walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); + trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) goto not_present; --walker->level; @@ -150,10 +152,9 @@ walk: pte_gpa += index * sizeof(pt_element_t); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - pgprintk("%s: table_gfn[%d] %lx\n", __func__, - walker->level - 1, table_gfn); kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) goto not_present; @@ -175,6 +176,8 @@ walk: #endif if (!(pte & PT_ACCESSED_MASK)) { + trace_kvm_mmu_set_accessed_bit(table_gfn, index, + sizeof(pte)); mark_page_dirty(vcpu->kvm, table_gfn); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_ACCESSED_MASK)) @@ -208,6 +211,7 @@ walk: if (write_fault && !is_dirty_gpte(pte)) { bool ret; + trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); @@ -239,6 +243,7 @@ err: walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; + trace_kvm_mmu_walker_error(walker->error_code); return 0; } -- cgit v1.2.3-70-g09d2 From 9c1b96e34717d001873b603d85434aa78e730282 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 9 Jun 2009 12:37:58 +0300 Subject: KVM: Document basic API Document the basic API corresponding to the 2.6.22 release. Signed-off-by: Avi Kivity --- Documentation/kvm/api.txt | 683 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 683 insertions(+) create mode 100644 Documentation/kvm/api.txt diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt new file mode 100644 index 000000000000..1b1c22da211b --- /dev/null +++ b/Documentation/kvm/api.txt @@ -0,0 +1,683 @@ +The Definitive KVM (Kernel-based Virtual Machine) API Documentation +=================================================================== + +1. General description + +The kvm API is a set of ioctls that are issued to control various aspects +of a virtual machine. The ioctls belong to three classes + + - System ioctls: These query and set global attributes which affect the + whole kvm subsystem. In addition a system ioctl is used to create + virtual machines + + - VM ioctls: These query and set attributes that affect an entire virtual + machine, for example memory layout. In addition a VM ioctl is used to + create virtual cpus (vcpus). + + Only run VM ioctls from the same process (address space) that was used + to create the VM. + + - vcpu ioctls: These query and set attributes that control the operation + of a single virtual cpu. + + Only run vcpu ioctls from the same thread that was used to create the + vcpu. + +2. File descritpors + +The kvm API is centered around file descriptors. An initial +open("/dev/kvm") obtains a handle to the kvm subsystem; this handle +can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this +handle will create a VM file descripror which can be used to issue VM +ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu +and return a file descriptor pointing to it. Finally, ioctls on a vcpu +fd can be used to control the vcpu, including the important task of +actually running guest code. + +In general file descriptors can be migrated among processes by means +of fork() and the SCM_RIGHTS facility of unix domain socket. These +kinds of tricks are explicitly not supported by kvm. While they will +not cause harm to the host, their actual behavior is not guaranteed by +the API. The only supported use is one virtual machine per process, +and one vcpu per thread. + +3. Extensions + +As of Linux 2.6.22, the KVM ABI has been stabilized: no backward +incompatible change are allowed. However, there is an extension +facility that allows backward-compatible extensions to the API to be +queried and used. + +The extension mechanism is not based on on the Linux version number. +Instead, kvm defines extension identifiers and a facility to query +whether a particular extension identifier is available. If it is, a +set of ioctls is available for application use. + +4. API description + +This section describes ioctls that can be used to control kvm guests. +For each ioctl, the following information is provided along with a +description: + + Capability: which KVM extension provides this ioctl. Can be 'basic', + which means that is will be provided by any kernel that supports + API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which + means availability needs to be checked with KVM_CHECK_EXTENSION + (see section 4.4). + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Type: system, vm, or vcpu. + + Parameters: what parameters are accepted by the ioctl. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + +4.1 KVM_GET_API_VERSION + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: the constant KVM_API_VERSION (=12) + +This identifies the API version as the stable kvm API. It is not +expected that this number will change. However, Linux 2.6.20 and +2.6.21 report earlier versions; these are not documented and not +supported. Applications should refuse to run if KVM_GET_API_VERSION +returns a value other than 12. If this check passes, all ioctls +described as 'basic' will be available. + +4.2 KVM_CREATE_VM + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: a VM fd that can be used to control the new virtual machine. + +The new VM has no virtual cpus and no memory. An mmap() of a VM fd +will access the virtual machine's physical address space; offset zero +corresponds to guest physical address zero. Use of mmap() on a VM fd +is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is +available. + +4.3 KVM_GET_MSR_INDEX_LIST + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_msr_list (in/out) +Returns: 0 on success; -1 on error +Errors: + E2BIG: the msr index list is to be to fit in the array specified by + the user. + +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + +This ioctl returns the guest msrs that are supported. The list varies +by kvm version and host processor, but does not change otherwise. The +user fills in the size of the indices array in nmsrs, and in return +kvm adjusts nmsrs to reflect the actual number of msrs and fills in +the indices array with their numbers. + +4.4 KVM_CHECK_EXTENSION + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: extension identifier (KVM_CAP_*) +Returns: 0 if unsupported; 1 (or some other positive integer) if supported + +The API allows the application to query about extensions to the core +kvm API. Userspace passes an extension identifier (an integer) and +receives an integer that describes the extension availability. +Generally 0 means no and 1 means yes, but some extensions may report +additional information in the integer return value. + +4.5 KVM_GET_VCPU_MMAP_SIZE + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: size of vcpu mmap area, in bytes + +The KVM_RUN ioctl (cf.) communicates with userspace via a shared +memory region. This ioctl returns the size of that region. See the +KVM_RUN documentation for details. + +4.6 KVM_SET_MEMORY_REGION + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: struct kvm_memory_region (in) +Returns: 0 on success, -1 on error + +struct kvm_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ +}; + +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES 1UL + +This ioctl allows the user to create or modify a guest physical memory +slot. When changing an existing slot, it may be moved in the guest +physical memory space, or its flags may be modified. It may not be +resized. Slots may not overlap. + +The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which +instructs kvm to keep track of writes to memory within the slot. See +the KVM_GET_DIRTY_LOG ioctl. + +It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead +of this API, if available. This newer API allows placing guest memory +at specified locations in the host address space, yielding better +control and easy access. + +4.6 KVM_CREATE_VCPU + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: vcpu id (apic id on x86) +Returns: vcpu fd on success, -1 on error + +This API adds a vcpu to a virtual machine. The vcpu id is a small integer +in the range [0, max_vcpus). + +4.7 KVM_GET_DIRTY_LOG (vm ioctl) + +Capability: basic +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_dirty_log (in/out) +Returns: 0 on success, -1 on error + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +Given a memory slot, return a bitmap containing any pages dirtied +since the last call to this ioctl. Bit 0 is the first page in the +memory slot. Ensure the entire structure is cleared to avoid padding +issues. + +4.8 KVM_SET_MEMORY_ALIAS + +Capability: basic +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_memory_alias (in) +Returns: 0 (success), -1 (error) + +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; + +Defines a guest physical address space region as an alias to another +region. Useful for aliased address, for example the VGA low memory +window. Should not be used with userspace memory. + +4.9 KVM_RUN + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: none +Returns: 0 on success, -1 on error +Errors: + EINTR: an unmasked signal is pending + +This ioctl is used to run a guest virtual cpu. While there are no +explicit parameters, there is an implicit parameter block that can be +obtained by mmap()ing the vcpu fd at offset 0, with the size given by +KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct +kvm_run' (see below). + +4.10 KVM_GET_REGS + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_regs (out) +Returns: 0 on success, -1 on error + +Reads the general purpose registers from the vcpu. + +/* x86 */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +4.11 KVM_SET_REGS + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_regs (in) +Returns: 0 on success, -1 on error + +Writes the general purpose registers into the vcpu. + +See KVM_GET_REGS for the data structure. + +4.12 KVM_GET_SREGS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_sregs (out) +Returns: 0 on success, -1 on error + +Reads special registers from the vcpu. + +/* x86 */ +struct kvm_sregs { + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +interrupt_bitmap is a bitmap of pending external interrupts. At most +one bit may be set. This interrupt has been acknowledged by the APIC +but not yet injected into the cpu core. + +4.13 KVM_SET_SREGS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_sregs (in) +Returns: 0 on success, -1 on error + +Writes special registers into the vcpu. See KVM_GET_SREGS for the +data structures. + +4.14 KVM_TRANSLATE + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_translation (in/out) +Returns: 0 on success, -1 on error + +Translates a virtual address according to the vcpu's current address +translation mode. + +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + +4.15 KVM_INTERRUPT + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_interrupt (in) +Returns: 0 on success, -1 on error + +Queues a hardware interrupt vector to be injected. This is only +useful if in-kernel local APIC is not used. + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +Note 'irq' is an interrupt vector, not an interrupt pin or line. + +4.16 KVM_DEBUG_GUEST + +Capability: basic +Architectures: none +Type: vcpu ioctl +Parameters: none) +Returns: -1 on error + +Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. + +4.17 KVM_GET_MSRS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_msrs (in/out) +Returns: 0 on success, -1 on error + +Reads model-specific registers from the vcpu. Supported msr indices can +be obtained using KVM_GET_MSR_INDEX_LIST. + +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array) and the 'index' member of each array entry. +kvm will fill in the 'data' member. + +4.18 KVM_SET_MSRS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_msrs (in) +Returns: 0 on success, -1 on error + +Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the +data structures. + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array), and the 'index' and 'data' members of each +array entry. + +4.19 KVM_SET_CPUID + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_cpuid (in) +Returns: 0 on success, -1 on error + +Defines the vcpu responses to the cpuid instruction. Applications +should use the KVM_SET_CPUID2 ioctl if available. + + +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +4.20 KVM_SET_SIGNAL_MASK + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_signal_mask (in) +Returns: 0 on success, -1 on error + +Defines which signals are blocked during execution of KVM_RUN. This +signal mask temporarily overrides the threads signal mask. Any +unblocked signal received (except SIGKILL and SIGSTOP, which retain +their traditional behaviour) will cause KVM_RUN to return with -EINTR. + +Note the signal will only be delivered if not blocked by the original +signal mask. + +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + +4.21 KVM_GET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (out) +Returns: 0 on success, -1 on error + +Reads the floating point state from the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +4.22 KVM_SET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (in) +Returns: 0 on success, -1 on error + +Writes the floating point state to the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +5. The kvm_run structure + +Application code obtains a pointer to the kvm_run structure by +mmap()ing a vcpu fd. From that point, application code can control +execution by changing fields in kvm_run prior to calling the KVM_RUN +ioctl, and obtain information about the reason KVM_RUN returned by +looking up structure members. + +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + +Request that KVM_RUN return when it becomes possible to inject external +interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. + + __u8 padding1[7]; + + /* out */ + __u32 exit_reason; + +When KVM_RUN has returned successfully (return value 0), this informs +application code why KVM_RUN has returned. Allowable values for this +field are detailed below. + + __u8 ready_for_interrupt_injection; + +If request_interrupt_window has been specified, this field indicates +an interrupt can be injected now with KVM_INTERRUPT. + + __u8 if_flag; + +The value of the current interrupt flag. Only valid if in-kernel +local APIC is not used. + + __u8 padding2[2]; + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + +The value of the cr8 register. Only valid if in-kernel local APIC is +not used. Both input and output. + + __u64 apic_base; + +The value of the APIC BASE msr. Only valid if in-kernel local +APIC is not used. Both input and output. + + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + +If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown +reasons. Further architecture-specific information is available in +hardware_exit_reason. + + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + +If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due +to unknown reasons. Further architecture-specific information is +available in hardware_entry_failure_reason. + + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + +Unused. + + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + +If exit_reason is KVM_EXIT_IO_IN or KVM_EXIT_IO_OUT, then the vcpu has +executed a port I/O instruction which could not be satisfied by kvm. +data_offset describes where the data is located (KVM_EXIT_IO_OUT) or +where kvm expects application code to place the data for the next +KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a patcked array. + + struct { + struct kvm_debug_exit_arch arch; + } debug; + +Unused. + + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + +If exit_reason is KVM_EXIT_MMIO or KVM_EXIT_IO_OUT, then the vcpu has +executed a memory-mapped I/O instruction which could not be satisfied +by kvm. The 'data' member contains the written data if 'is_write' is +true, and should be filled by application code otherwise. + + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + +Unused. + + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + +To be documented (KVM_TPR_ACCESS_REPORTING). + + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u64 mask; /* psw upper half */ + __u64 addr; /* psw lower half */ + __u16 ipa; + __u32 ipb; + } s390_sieic; + +s390 specific. + + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + +s390 specific. + + /* KVM_EXIT_DCR */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + +powerpc specific. + + /* Fix the size of the union. */ + char padding[256]; + }; +}; -- cgit v1.2.3-70-g09d2 From f691fe1da7e2715137d21ae5a80bec64db4625db Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 6 Jul 2009 15:58:14 +0300 Subject: KVM: Trace shadow page lifecycle Create, sync, unsync, zap. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++-- arch/x86/kvm/mmutrace.h | 103 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c0dda6447b9f..ac121b39a5bc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1122,6 +1122,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 1; } + trace_kvm_mmu_sync_page(sp); if (rmap_write_protect(vcpu->kvm, sp->gfn)) kvm_flush_remote_tlbs(vcpu->kvm); kvm_unlink_unsync_page(vcpu->kvm, sp); @@ -1244,8 +1245,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - pgprintk("%s: looking gfn %lx role %x\n", __func__, - gfn, role.word); index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) @@ -1262,14 +1261,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(vcpu, sp); } - pgprintk("%s: found\n", __func__); + trace_kvm_mmu_get_page(sp, false); return sp; } ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) return sp; - pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); @@ -1282,6 +1280,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, vcpu->arch.mmu.prefetch_page(vcpu, sp); else nonpaging_prefetch_page(vcpu, sp); + trace_kvm_mmu_get_page(sp, true); return sp; } @@ -1410,6 +1409,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { int ret; + + trace_kvm_mmu_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); @@ -1656,6 +1657,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) struct kvm_mmu_page *s; struct hlist_node *node, *n; + trace_kvm_mmu_unsync_page(sp); index = kvm_page_table_hashfn(sp->gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; /* don't unsync if pagetable is shadowed with multiple roles */ diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 1367f82717d1..3e4a5c6ca2a9 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -2,12 +2,48 @@ #define _TRACE_KVMMMU_H #include +#include #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE mmutrace +#define KVM_MMU_PAGE_FIELDS \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ + __field(__u32, unsync) + +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ + __entry->unsync = sp->unsync; + +#define KVM_MMU_PAGE_PRINTK() ({ \ + const char *ret = p->buffer + p->len; \ + static const char *access_str[] = { \ + "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ + }; \ + union kvm_mmu_page_role role; \ + \ + role.word = __entry->role; \ + \ + trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ + " %snxe root %u %s%c", \ + __entry->gfn, role.level, role.glevels, \ + role.quadrant, \ + role.direct ? " direct" : "", \ + access_str[role.access], \ + role.invalid ? " invalid" : "", \ + role.cr4_pge ? "" : "!", \ + role.nxe ? "" : "!", \ + __entry->root_count, \ + __entry->unsync ? "unsync" : "sync", 0); \ + ret; \ + }) + #define kvm_mmu_trace_pferr_flags \ { PFERR_PRESENT_MASK, "P" }, \ { PFERR_WRITE_MASK, "W" }, \ @@ -111,6 +147,73 @@ TRACE_EVENT( __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) ); +TRACE_EVENT( + kvm_mmu_get_page, + TP_PROTO(struct kvm_mmu_page *sp, bool created), + TP_ARGS(sp, created), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + __field(bool, created) + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + __entry->created = created; + ), + + TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), + __entry->created ? "new" : "existing") +); + +TRACE_EVENT( + kvm_mmu_sync_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +TRACE_EVENT( + kvm_mmu_unsync_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +TRACE_EVENT( + kvm_mmu_zap_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + #endif /* _TRACE_KVMMMU_H */ /* This part must be outside protection */ -- cgit v1.2.3-70-g09d2 From f7c6d140032ae7e80df36d350c6edc41f92b2f94 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 2 Jul 2009 15:04:14 +0200 Subject: KVM: fix MMIO_CONF_BASE MSR access Some Windows versions check whether the BIOS has setup MMI/O for config space accesses on AMD Fam10h CPUs, we say "no" by returning 0 on reads and only allow disabling of MMI/O CfgSpace setup by igoring "0" writes. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e74d98641a0..95fa45c0f16e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -844,6 +844,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; } break; + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); + return 1; + } + break; case MSR_AMD64_NB_CFG: break; case MSR_IA32_DEBUGCTLMSR: @@ -1055,6 +1062,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K7_EVNTSEL0: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; case MSR_MTRRcap: -- cgit v1.2.3-70-g09d2 From 07fb8bb29f5269f30afa6f1b3d941406677f6ad2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 5 Jul 2009 21:27:58 +0300 Subject: KVM: ignore msi request if !level Irqfd sets level for interrupt to 1 and then to 0. For MSI, check level so that a single message is sent. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- virt/kvm/irq_comm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 56e696104d43..a9d7fd1f1d6a 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -139,7 +139,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) else clear_bit(irq_source_id, irq_state); sig_level = !!(*irq_state); - } else /* Deal with MSI/MSI-X */ + } else if (!level) + return ret; + else /* Deal with MSI/MSI-X */ sig_level = 1; /* Not possible to detect if the guest uses the PIC or the -- cgit v1.2.3-70-g09d2 From 1000ff8d893765d7b56e32fe16dbe4814f172588 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 7 Jul 2009 16:00:57 +0300 Subject: KVM: Add trace points in irqchip code Add tracepoint in msi/ioapic/pic set_irq() functions, in IPI sending and in the point where IRQ is placed into apic's IRR. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 3 ++ arch/x86/kvm/lapic.c | 4 +++ arch/x86/kvm/trace.h | 85 ++++++++++++++++++++++++++++++++++++++++++++++ include/trace/events/kvm.h | 56 ++++++++++++++++++++++++++++++ virt/kvm/ioapic.c | 2 ++ virt/kvm/irq_comm.c | 2 ++ 6 files changed, 152 insertions(+) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 1d1bb75dc7bc..e4bcbddecb36 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -30,6 +30,7 @@ #include "irq.h" #include +#include "trace.h" static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) @@ -190,6 +191,8 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); + trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, + s->pics[irq >> 3].imr, ret == 0); } pic_unlock(s); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6c8460308548..5d697602048b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -375,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; result = !apic_test_and_set_irr(vector, apic); + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector, result); if (!result) { if (trig_mode) apic_debug("level trig mode repeatedly for " @@ -493,6 +495,8 @@ static void apic_send_ipi(struct kvm_lapic *apic) else irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + trace_kvm_apic_ipi(icr_low, irq.dest_id); + apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 6c2c87fa6e4f..0d480e77eacf 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -264,6 +264,91 @@ TRACE_EVENT(kvm_cr, #define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) #define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) +TRACE_EVENT(kvm_pic_set_irq, + TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), + TP_ARGS(chip, pin, elcr, imr, coalesced), + + TP_STRUCT__entry( + __field( __u8, chip ) + __field( __u8, pin ) + __field( __u8, elcr ) + __field( __u8, imr ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->chip = chip; + __entry->pin = pin; + __entry->elcr = elcr; + __entry->imr = imr; + __entry->coalesced = coalesced; + ), + + TP_printk("chip %u pin %u (%s%s)%s", + __entry->chip, __entry->pin, + (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", + (__entry->imr & (1 << __entry->pin)) ? "|masked":"", + __entry->coalesced ? " (coalesced)" : "") +); + +#define kvm_apic_dst_shorthand \ + {0x0, "dst"}, \ + {0x1, "self"}, \ + {0x2, "all"}, \ + {0x3, "all-but-self"} + +TRACE_EVENT(kvm_apic_ipi, + TP_PROTO(__u32 icr_low, __u32 dest_id), + TP_ARGS(icr_low, dest_id), + + TP_STRUCT__entry( + __field( __u32, icr_low ) + __field( __u32, dest_id ) + ), + + TP_fast_assign( + __entry->icr_low = icr_low; + __entry->dest_id = dest_id; + ), + + TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", + __entry->dest_id, (u8)__entry->icr_low, + __print_symbolic((__entry->icr_low >> 8 & 0x7), + kvm_deliver_mode), + (__entry->icr_low & (1<<11)) ? "logical" : "physical", + (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", + (__entry->icr_low & (1<<15)) ? "level" : "edge", + __print_symbolic((__entry->icr_low >> 18 & 0x3), + kvm_apic_dst_shorthand)) +); + +TRACE_EVENT(kvm_apic_accept_irq, + TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced), + TP_ARGS(apicid, dm, tm, vec, coalesced), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( __u16, dm ) + __field( __u8, tm ) + __field( __u8, vec ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->apicid = apicid; + __entry->dm = dm; + __entry->tm = tm; + __entry->vec = vec; + __entry->coalesced = coalesced; + ), + + TP_printk("apicid %x vec %u (%s|%s)%s", + __entry->apicid, __entry->vec, + __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), + __entry->tm ? "level" : "edge", + __entry->coalesced ? " (coalesced)" : "") +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 77022af48492..dbe108455275 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -28,6 +28,62 @@ TRACE_EVENT(kvm_set_irq, __entry->gsi, __entry->level, __entry->irq_source_id) ); +#define kvm_deliver_mode \ + {0x0, "Fixed"}, \ + {0x1, "LowPrio"}, \ + {0x2, "SMI"}, \ + {0x3, "Res3"}, \ + {0x4, "NMI"}, \ + {0x5, "INIT"}, \ + {0x6, "SIPI"}, \ + {0x7, "ExtINT"} + +TRACE_EVENT(kvm_ioapic_set_irq, + TP_PROTO(__u64 e, int pin, bool coalesced), + TP_ARGS(e, pin, coalesced), + + TP_STRUCT__entry( + __field( __u64, e ) + __field( int, pin ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->e = e; + __entry->pin = pin; + __entry->coalesced = coalesced; + ), + + TP_printk("pin %u dst %x vec=%u (%s|%s|%s%s)%s", + __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), + (__entry->e & (1<<11)) ? "logical" : "physical", + (__entry->e & (1<<15)) ? "level" : "edge", + (__entry->e & (1<<16)) ? "|masked" : "", + __entry->coalesced ? " (coalesced)" : "") +); + +TRACE_EVENT(kvm_msi_set_irq, + TP_PROTO(__u64 address, __u64 data), + TP_ARGS(address, data), + + TP_STRUCT__entry( + __field( __u64, address ) + __field( __u64, data ) + ), + + TP_fast_assign( + __entry->address = address; + __entry->data = data; + ), + + TP_printk("dst %u vec %x (%s|%s|%s%s)", + (u8)(__entry->address >> 12), (u8)__entry->data, + __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), + (__entry->address & (1<<2)) ? "logical" : "physical", + (__entry->data & (1<<15)) ? "level" : "edge", + (__entry->address & (1<<3)) ? "|rh" : "") +); #define kvm_irqchips \ {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 92496ff3d82d..b91fbb215447 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ioapic.h" #include "lapic.h" @@ -193,6 +194,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) (!edge && !entry.fields.remote_irr)) ret = ioapic_service(ioapic, irq); } + trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); } return ret; } diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a9d7fd1f1d6a..001663ff401a 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -100,6 +100,8 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, { struct kvm_lapic_irq irq; + trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + irq.dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; irq.vector = (e->msi.data & -- cgit v1.2.3-70-g09d2 From c7f0f24b1f82ec049fcfb8e6d4e1cee2815efbe5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 7 Jul 2009 15:27:32 +0300 Subject: KVM: No need to kick cpu if not in a guest mode This will save a couple of IPIs. Signed-off-by: Gleb Natapov Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 95fa45c0f16e..e3d9040f2156 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3506,6 +3506,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) smp_mb__after_clear_bit(); if (vcpu->requests || need_resched() || signal_pending(current)) { + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); preempt_enable(); r = 1; -- cgit v1.2.3-70-g09d2 From 0d1de2d901f4ba0972a3886496a44fb1d3300dbd Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 12 Jul 2009 16:10:55 +0300 Subject: KVM: Always report x2apic as supported feature We emulate x2apic in software, so host support is not required. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3d9040f2156..dfb0e37b3c65 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1504,6 +1504,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 1: entry->edx &= kvm_supported_word0_x86_features; entry->ecx &= kvm_supported_word4_x86_features; + /* we support x2apic emulation even if host does not support + * it since we emulate x2apic in software */ + entry->ecx |= F(X2APIC); break; /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before -- cgit v1.2.3-70-g09d2 From e9f4275732add046fed4a548b8dbb98dbe500d2f Mon Sep 17 00:00:00 2001 From: Beth Kon Date: Tue, 7 Jul 2009 11:50:38 -0400 Subject: KVM: PIT support for HPET legacy mode When kvm is in hpet_legacy_mode, the hpet is providing the timer interrupt and the pit should not be. So in legacy mode, the pit timer is destroyed, but the *state* of the pit is maintained. So if kvm or the guest tries to modify the state of the pit, this modification is accepted, *except* that the timer isn't actually started. When we exit hpet_legacy_mode, the current state of the pit (which is up to date since we've been accepting modifications) is used to restart the pit timer. The saved_mode code in kvm_pit_load_count temporarily changes mode to 0xff in order to destroy the timer, but then restores the actual value, again maintaining "current" state of the pit for possible later reenablement. [avi: add some reserved storage in the ioctl; make SET_PIT2 IOW] [marcelo: fix memory corruption due to reserved storage] Signed-off-by: Beth Kon Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 9 +++++++ arch/x86/kvm/i8254.c | 22 +++++++++++++---- arch/x86/kvm/i8254.h | 3 ++- arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/kvm.h | 6 +++++ 5 files changed, 93 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 708b9c32a5da..4a5fe914dc59 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -18,6 +18,7 @@ #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE +#define __KVM_HAVE_PIT_STATE2 /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -237,6 +238,14 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; +}; + struct kvm_reinject_control { __u8 pit_reinject; __u8 reserved[31]; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 8c3ac30ef9bd..9b62c57ba6e4 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -332,20 +332,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(ps, val, 0); + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { + create_pit_timer(ps, val, 0); + } break; case 2: case 3: - create_pit_timer(ps, val, 1); + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ + create_pit_timer(ps, val, 1); + } break; default: destroy_pit_timer(&ps->pit_timer); } } -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) { - pit_load_count(kvm, channel, val); + u8 saved_mode; + if (hpet_legacy_start) { + /* save existing mode for later reenablement */ + saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; + kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ + pit_load_count(kvm, channel, val); + kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; + } else { + pit_load_count(kvm, channel, val); + } } static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) @@ -554,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit) struct kvm_kpit_channel_state *c; mutex_lock(&pit->pit_state.lock); + pit->pit_state.flags = 0; for (i = 0; i < 3; i++) { c = &pit->pit_state.channels[i]; c->mode = 0xff; diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index b2670180f225..d4c1c7ffdc09 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -21,6 +21,7 @@ struct kvm_kpit_channel_state { struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; + u32 flags; struct kvm_timer pit_timer; bool is_periodic; u32 speaker_data_on; @@ -49,7 +50,7 @@ struct kvm_pit { #define KVM_PIT_CHANNEL_MASK 0x3 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); void kvm_pit_reset(struct kvm_pit *pit); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dfb0e37b3c65..2214384ff610 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1213,6 +1213,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_IRQFD: case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2078,7 +2079,36 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); - kvm_pit_load_count(kvm, 0, ps->channels[0].count); + kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0; + + mutex_lock(&kvm->arch.vpit->pit_state.lock); + memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, + sizeof(ps->channels)); + ps->flags = kvm->arch.vpit->pit_state.flags; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0, start = 0; + u32 prev_legacy, cur_legacy; + mutex_lock(&kvm->arch.vpit->pit_state.lock); + prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + if (!prev_legacy && cur_legacy) + start = 1; + memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, + sizeof(kvm->arch.vpit->pit_state.channels)); + kvm->arch.vpit->pit_state.flags = ps->flags; + kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; } @@ -2140,6 +2170,7 @@ long kvm_arch_vm_ioctl(struct file *filp, */ union { struct kvm_pit_state ps; + struct kvm_pit_state2 ps2; struct kvm_memory_alias alias; struct kvm_pit_config pit_config; } u; @@ -2322,6 +2353,32 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_GET_PIT2: { + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) + goto out; + r = 0; + break; + } + case KVM_SET_PIT2: { + r = -EFAULT; + if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); + if (r) + goto out; + r = 0; + break; + } case KVM_REINJECT_CONTROL: { struct kvm_reinject_control control; r = -EFAULT; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 76c640834ea6..a74a1fcc28e9 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -409,6 +409,9 @@ struct kvm_guest_debug { #define KVM_CAP_PIT2 33 #endif #define KVM_CAP_SET_BOOT_CPU_ID 34 +#ifdef __KVM_HAVE_PIT_STATE2 +#define KVM_CAP_PIT_STATE2 35 +#endif #ifdef KVM_CAP_IRQ_ROUTING @@ -586,6 +589,9 @@ struct kvm_debug_guest { #define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) #define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) +#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) +#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) + #define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) #define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) #define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) -- cgit v1.2.3-70-g09d2 From fef07aae9cd9ed82f94228c311b35360f1f38902 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 10 Jul 2009 14:20:35 +0200 Subject: KVM: add module parameters documentation Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- Documentation/kernel-parameters.txt | 39 +++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b801fe6a..877a02a12e0f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -57,6 +57,7 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. + KVM Kernel Virtual Machine support is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1098,6 +1099,44 @@ and is between 256 and 4096 characters. It is defined in the file kstack=N [X86] Print N words from the kernel stack in oops dumps. + kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. + Default is 0 (don't ignore, but inject #GP) + + kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging. + Default is 1 (enabled) + + kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. + Default is 0 (off) + + kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU) + for all guests. + Default is 1 (enabled) if in 64bit or 32bit-PAE mode + + kvm-intel.bypass_guest_pf= + [KVM,Intel] Disables bypassing of guest page faults + on Intel chips. Default is 1 (enabled) + + kvm-intel.ept= [KVM,Intel] Disable extended page tables + (virtualized MMU) support on capable Intel chips. + Default is 1 (enabled) + + kvm-intel.emulate_invalid_guest_state= + [KVM,Intel] Enable emulation of invalid guest states + Default is 0 (disabled) + + kvm-intel.flexpriority= + [KVM,Intel] Disable FlexPriority feature (TPR shadow). + Default is 1 (enabled) + + kvm-intel.unrestricted_guest= + [KVM,Intel] Disable unrestricted guest feature + (virtualized real and unpaged mode) on capable + Intel chips. Default is 1 (enabled) + + kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification + feature (tagged TLBs) on capable Intel chips. + Default is 1 (enabled) + l2cr= [PPC] l3cr= [PPC] -- cgit v1.2.3-70-g09d2 From 090b7aff27120cdae76a346a70db394844fea598 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Tue, 7 Jul 2009 17:08:44 -0400 Subject: KVM: make io_bus interface more robust Today kvm_io_bus_regsiter_dev() returns void and will internally BUG_ON if it fails. We want to create dynamic MMIO/PIO entries driven from userspace later in the series, so we need to enhance the code to be more robust with the following changes: 1) Add a return value to the registration function 2) Fix up all the callsites to check the return code, handle any failures, and percolate the error up to the caller. 3) Add an unregister function that collapses holes in the array Signed-off-by: Gregory Haskins Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 20 ++++++++++++++++++-- arch/x86/kvm/i8259.c | 9 ++++++++- include/linux/kvm_host.h | 10 +++++++--- virt/kvm/coalesced_mmio.c | 8 ++++++-- virt/kvm/ioapic.c | 8 ++++++-- virt/kvm/kvm_main.c | 39 ++++++++++++++++++++++++++++++++++----- 6 files changed, 79 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 9b62c57ba6e4..137e54817102 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -605,6 +605,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; struct kvm_kpit_state *pit_state; + int ret; pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); if (!pit) @@ -639,14 +640,29 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + if (ret < 0) + goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); + ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + &pit->speaker_dev); + if (ret < 0) + goto fail_unregister; } return pit; + +fail_unregister: + __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + +fail: + if (pit->irq_source_id >= 0) + kvm_free_irq_source_id(kvm, pit->irq_source_id); + + kfree(pit); + return NULL; } void kvm_free_pit(struct kvm *kvm) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e4bcbddecb36..daf4606b0293 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -539,6 +539,8 @@ static const struct kvm_io_device_ops picdev_ops = { struct kvm_pic *kvm_create_pic(struct kvm *kvm) { struct kvm_pic *s; + int ret; + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; @@ -555,6 +557,11 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + if (ret < 0) { + kfree(s); + return NULL; + } + return s; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 077e8bb875a9..983b0bdeb3ff 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -64,10 +64,14 @@ int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, int len, const void *val); int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val); -void __kvm_io_bus_register_dev(struct kvm_io_bus *bus, +int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev); +int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, + struct kvm_io_device *dev); +void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev); +void kvm_io_bus_unregister_dev(struct kvm *kvm, struct kvm_io_bus *bus, struct kvm_io_device *dev); -void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, - struct kvm_io_device *dev); struct kvm_vcpu { struct kvm *kvm; diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 0352f81ecc0b..04d69cd7049b 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -92,6 +92,7 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = { int kvm_coalesced_mmio_init(struct kvm *kvm) { struct kvm_coalesced_mmio_dev *dev; + int ret; dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); if (!dev) @@ -100,9 +101,12 @@ int kvm_coalesced_mmio_init(struct kvm *kvm) kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; kvm->coalesced_mmio_dev = dev; - kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev); - return 0; + ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev); + if (ret < 0) + kfree(dev); + + return ret; } int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index b91fbb215447..fa05f67423ab 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -342,6 +342,7 @@ static const struct kvm_io_device_ops ioapic_mmio_ops = { int kvm_ioapic_init(struct kvm *kvm) { struct kvm_ioapic *ioapic; + int ret; ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); if (!ioapic) @@ -350,7 +351,10 @@ int kvm_ioapic_init(struct kvm *kvm) kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; - kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev); - return 0; + ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev); + if (ret < 0) + kfree(ioapic); + + return ret; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fc1b58a72757..9c2fd025b8ae 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2533,21 +2533,50 @@ int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val) return -EOPNOTSUPP; } -void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, +int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, struct kvm_io_device *dev) { + int ret; + down_write(&kvm->slots_lock); - __kvm_io_bus_register_dev(bus, dev); + ret = __kvm_io_bus_register_dev(bus, dev); up_write(&kvm->slots_lock); + + return ret; } /* An unlocked version. Caller must have write lock on slots_lock. */ -void __kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev) +int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev) { - BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); + if (bus->dev_count > NR_IOBUS_DEVS-1) + return -ENOSPC; bus->devs[bus->dev_count++] = dev; + + return 0; +} + +void kvm_io_bus_unregister_dev(struct kvm *kvm, + struct kvm_io_bus *bus, + struct kvm_io_device *dev) +{ + down_write(&kvm->slots_lock); + __kvm_io_bus_unregister_dev(bus, dev); + up_write(&kvm->slots_lock); +} + +/* An unlocked version. Caller must have write lock on slots_lock. */ +void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) + if (bus->devs[i] == dev) { + bus->devs[i] = bus->devs[--bus->dev_count]; + break; + } } static struct notifier_block kvm_cpu_notifier = { -- cgit v1.2.3-70-g09d2 From d34e6b175e61821026893ec5298cc8e7558df43a Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Tue, 7 Jul 2009 17:08:49 -0400 Subject: KVM: add ioeventfd support ioeventfd is a mechanism to register PIO/MMIO regions to trigger an eventfd signal when written to by a guest. Host userspace can register any arbitrary IO address with a corresponding eventfd and then pass the eventfd to a specific end-point of interest for handling. Normal IO requires a blocking round-trip since the operation may cause side-effects in the emulated model or may return data to the caller. Therefore, an IO in KVM traps from the guest to the host, causes a VMX/SVM "heavy-weight" exit back to userspace, and is ultimately serviced by qemu's device model synchronously before returning control back to the vcpu. However, there is a subclass of IO which acts purely as a trigger for other IO (such as to kick off an out-of-band DMA request, etc). For these patterns, the synchronous call is particularly expensive since we really only want to simply get our notification transmitted asychronously and return as quickly as possible. All the sychronous infrastructure to ensure proper data-dependencies are met in the normal IO case are just unecessary overhead for signalling. This adds additional computational load on the system, as well as latency to the signalling path. Therefore, we provide a mechanism for registration of an in-kernel trigger point that allows the VCPU to only require a very brief, lightweight exit just long enough to signal an eventfd. This also means that any clients compatible with the eventfd interface (which includes userspace and kernelspace equally well) can now register to be notified. The end result should be a more flexible and higher performance notification API for the backend KVM hypervisor and perhipheral components. To test this theory, we built a test-harness called "doorbell". This module has a function called "doorbell_ring()" which simply increments a counter for each time the doorbell is signaled. It supports signalling from either an eventfd, or an ioctl(). We then wired up two paths to the doorbell: One via QEMU via a registered io region and through the doorbell ioctl(). The other is direct via ioeventfd. You can download this test harness here: ftp://ftp.novell.com/dev/ghaskins/doorbell.tar.bz2 The measured results are as follows: qemu-mmio: 110000 iops, 9.09us rtt ioeventfd-mmio: 200100 iops, 5.00us rtt ioeventfd-pio: 367300 iops, 2.72us rtt I didn't measure qemu-pio, because I have to figure out how to register a PIO region with qemu's device model, and I got lazy. However, for now we can extrapolate based on the data from the NULLIO runs of +2.56us for MMIO, and -350ns for HC, we get: qemu-pio: 153139 iops, 6.53us rtt ioeventfd-hc: 412585 iops, 2.37us rtt these are just for fun, for now, until I can gather more data. Here is a graph for your convenience: http://developer.novell.com/wiki/images/7/76/Iofd-chart.png The conclusion to draw is that we save about 4us by skipping the userspace hop. -------------------- Signed-off-by: Gregory Haskins Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + include/linux/kvm.h | 24 +++++ include/linux/kvm_host.h | 10 +- virt/kvm/eventfd.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++- virt/kvm/kvm_main.c | 11 ++- 5 files changed, 293 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2214384ff610..42160b031fcd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1212,6 +1212,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_IRQFD: + case KVM_CAP_IOEVENTFD: case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: r = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index a74a1fcc28e9..230a91aa61c9 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -307,6 +307,28 @@ struct kvm_guest_debug { struct kvm_guest_debug_arch arch; }; +enum { + kvm_ioeventfd_flag_nr_datamatch, + kvm_ioeventfd_flag_nr_pio, + kvm_ioeventfd_flag_nr_deassign, + kvm_ioeventfd_flag_nr_max, +}; + +#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) +#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) +#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) + +#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) + +struct kvm_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 pad[36]; +}; + #define KVM_TRC_SHIFT 16 /* * kvm trace categories @@ -412,6 +434,7 @@ struct kvm_guest_debug { #ifdef __KVM_HAVE_PIT_STATE2 #define KVM_CAP_PIT_STATE2 35 #endif +#define KVM_CAP_IOEVENTFD 36 #ifdef KVM_CAP_IRQ_ROUTING @@ -520,6 +543,7 @@ struct kvm_irqfd { #define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) #define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) #define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) +#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd) /* * ioctls for vcpu fds diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 983b0bdeb3ff..6ec9fc56a49e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -155,6 +155,7 @@ struct kvm { spinlock_t lock; struct list_head items; } irqfds; + struct list_head ioeventfds; #endif struct kvm_vm_stat stat; struct kvm_arch arch; @@ -528,19 +529,24 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} #ifdef CONFIG_HAVE_KVM_EVENTFD -void kvm_irqfd_init(struct kvm *kvm); +void kvm_eventfd_init(struct kvm *kvm); int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); void kvm_irqfd_release(struct kvm *kvm); +int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); #else -static inline void kvm_irqfd_init(struct kvm *kvm) {} +static inline void kvm_eventfd_init(struct kvm *kvm) {} static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) { return -EINVAL; } static inline void kvm_irqfd_release(struct kvm *kvm) {} +static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + return -ENOSYS; +} #endif /* CONFIG_HAVE_KVM_EVENTFD */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 4092b8dcd510..99017e8a92ac 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -28,6 +29,9 @@ #include #include #include +#include + +#include "iodev.h" /* * -------------------------------------------------------------------- @@ -234,10 +238,11 @@ fail: } void -kvm_irqfd_init(struct kvm *kvm) +kvm_eventfd_init(struct kvm *kvm) { spin_lock_init(&kvm->irqfds.lock); INIT_LIST_HEAD(&kvm->irqfds.items); + INIT_LIST_HEAD(&kvm->ioeventfds); } /* @@ -327,3 +332,247 @@ static void __exit irqfd_module_exit(void) module_init(irqfd_module_init); module_exit(irqfd_module_exit); + +/* + * -------------------------------------------------------------------- + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. + * + * userspace can register a PIO/MMIO address with an eventfd for receiving + * notification when the memory has been touched. + * -------------------------------------------------------------------- + */ + +struct _ioeventfd { + struct list_head list; + u64 addr; + int length; + struct eventfd_ctx *eventfd; + u64 datamatch; + struct kvm_io_device dev; + bool wildcard; +}; + +static inline struct _ioeventfd * +to_ioeventfd(struct kvm_io_device *dev) +{ + return container_of(dev, struct _ioeventfd, dev); +} + +static void +ioeventfd_release(struct _ioeventfd *p) +{ + eventfd_ctx_put(p->eventfd); + list_del(&p->list); + kfree(p); +} + +static bool +ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) +{ + u64 _val; + + if (!(addr == p->addr && len == p->length)) + /* address-range must be precise for a hit */ + return false; + + if (p->wildcard) + /* all else equal, wildcard is always a hit */ + return true; + + /* otherwise, we have to actually compare the data */ + + BUG_ON(!IS_ALIGNED((unsigned long)val, len)); + + switch (len) { + case 1: + _val = *(u8 *)val; + break; + case 2: + _val = *(u16 *)val; + break; + case 4: + _val = *(u32 *)val; + break; + case 8: + _val = *(u64 *)val; + break; + default: + return false; + } + + return _val == p->datamatch ? true : false; +} + +/* MMIO/PIO writes trigger an event if the addr/val match */ +static int +ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + if (!ioeventfd_in_range(p, addr, len, val)) + return -EOPNOTSUPP; + + eventfd_signal(p->eventfd, 1); + return 0; +} + +/* + * This function is called as KVM is completely shutting down. We do not + * need to worry about locking just nuke anything we have as quickly as possible + */ +static void +ioeventfd_destructor(struct kvm_io_device *this) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + ioeventfd_release(p); +} + +static const struct kvm_io_device_ops ioeventfd_ops = { + .write = ioeventfd_write, + .destructor = ioeventfd_destructor, +}; + +/* assumes kvm->slots_lock held */ +static bool +ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) +{ + struct _ioeventfd *_p; + + list_for_each_entry(_p, &kvm->ioeventfds, list) + if (_p->addr == p->addr && _p->length == p->length && + (_p->wildcard || p->wildcard || + _p->datamatch == p->datamatch)) + return true; + + return false; +} + +static int +kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; + struct _ioeventfd *p; + struct eventfd_ctx *eventfd; + int ret; + + /* must be natural-word sized */ + switch (args->len) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return -EINVAL; + } + + /* check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* check for extra flags that we don't understand */ + if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&p->list); + p->addr = args->addr; + p->length = args->len; + p->eventfd = eventfd; + + /* The datamatch feature is optional, otherwise this is a wildcard */ + if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) + p->datamatch = args->datamatch; + else + p->wildcard = true; + + down_write(&kvm->slots_lock); + + /* Verify that there isnt a match already */ + if (ioeventfd_check_collision(kvm, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + kvm_iodevice_init(&p->dev, &ioeventfd_ops); + + ret = __kvm_io_bus_register_dev(bus, &p->dev); + if (ret < 0) + goto unlock_fail; + + list_add_tail(&p->list, &kvm->ioeventfds); + + up_write(&kvm->slots_lock); + + return 0; + +unlock_fail: + up_write(&kvm->slots_lock); + +fail: + kfree(p); + eventfd_ctx_put(eventfd); + + return ret; +} + +static int +kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; + struct _ioeventfd *p, *tmp; + struct eventfd_ctx *eventfd; + int ret = -ENOENT; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + down_write(&kvm->slots_lock); + + list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { + bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); + + if (p->eventfd != eventfd || + p->addr != args->addr || + p->length != args->len || + p->wildcard != wildcard) + continue; + + if (!p->wildcard && p->datamatch != args->datamatch) + continue; + + __kvm_io_bus_unregister_dev(bus, &p->dev); + ioeventfd_release(p); + ret = 0; + break; + } + + up_write(&kvm->slots_lock); + + eventfd_ctx_put(eventfd); + + return ret; +} + +int +kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) + return kvm_deassign_ioeventfd(kvm, args); + + return kvm_assign_ioeventfd(kvm, args); +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9c2fd025b8ae..d7b9bbba26da 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -979,7 +979,7 @@ static struct kvm *kvm_create_vm(void) spin_lock_init(&kvm->mmu_lock); spin_lock_init(&kvm->requests_lock); kvm_io_bus_init(&kvm->pio_bus); - kvm_irqfd_init(kvm); + kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); kvm_io_bus_init(&kvm->mmio_bus); @@ -2271,6 +2271,15 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); break; } + case KVM_IOEVENTFD: { + struct kvm_ioeventfd data; + + r = -EFAULT; + if (copy_from_user(&data, argp, sizeof data)) + goto out; + r = kvm_ioeventfd(kvm, &data); + break; + } #ifdef CONFIG_KVM_APIC_ARCHITECTURE case KVM_SET_BOOT_CPU_ID: r = 0; -- cgit v1.2.3-70-g09d2 From a205bc19f0d203f31a15bd2f9464ef05f918b77e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jul 2009 16:36:01 +0200 Subject: KVM: MMU: Fix MMU_DEBUG compile breakage Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ac121b39a5bc..f1f08159e11e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1808,8 +1808,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", - is_present_pte(*sptep)?"RW":"R", gfn, - *shadow_pte, sptep); + *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; -- cgit v1.2.3-70-g09d2 From b59bb7bdf08ee98e0d1f1758901f545655b7a413 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 9 Jul 2009 15:33:51 +0300 Subject: KVM: Move exception handling to the same place as other events Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42160b031fcd..11cfd897aac6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -223,13 +223,6 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -static void __queue_exception(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); -} - /* * Load the pae pdptrs. Return true is they are all valid. */ @@ -3491,9 +3484,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { /* try to reinject previous events if any */ + if (vcpu->arch.exception.pending) { + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); + return; + } + if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); return; @@ -3574,10 +3574,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - if (vcpu->arch.exception.pending) - __queue_exception(vcpu); - else - inject_pending_irq(vcpu, kvm_run); + inject_pending_event(vcpu, kvm_run); /* enable NMI/IRQ window open exits if needed */ if (vcpu->arch.nmi_pending) -- cgit v1.2.3-70-g09d2 From 0b71785dc05f1f66e6268022b9953c0d6a9985c6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 9 Jul 2009 15:33:53 +0300 Subject: KVM: Move kvm_cpu_get_interrupt() declaration to x86 code It is implemented only by x86. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + include/linux/kvm_host.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 30b625d8e5f0..3f4f00a23536 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -797,5 +797,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void); int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6ec9fc56a49e..2c6a5f008746 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -341,7 +341,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); -int kvm_cpu_get_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -- cgit v1.2.3-70-g09d2 From a1b37100d9e29c1f8dc3e2f5490a205c80180e01 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 9 Jul 2009 15:33:52 +0300 Subject: KVM: Reduce runnability interface with arch support code Remove kvm_cpu_has_interrupt() and kvm_arch_interrupt_allowed() from interface between general code and arch code. kvm_arch_vcpu_runnable() checks for interrupts instead. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 16 ++-------------- arch/powerpc/kvm/powerpc.c | 13 +------------ arch/s390/kvm/interrupt.c | 8 +------- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 6 ++++-- include/linux/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 4 +--- 7 files changed, 11 insertions(+), 40 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index d7aa6bb8f477..0ad09f05efa9 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1935,19 +1935,6 @@ int kvm_highest_pending_irq(struct kvm_vcpu *vcpu) return find_highest_bits((int *)&vpd->irr[0]); } -int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) -{ - if (kvm_highest_pending_irq(vcpu) != -1) - return 1; - return 0; -} - -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - /* do real check here */ - return 1; -} - int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return vcpu->arch.timer_fired; @@ -1960,7 +1947,8 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE; + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) || + (kvm_highest_pending_irq(vcpu) != -1); } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0341391eff12..2a4551f78f60 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -39,20 +39,9 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) return gfn; } -int kvm_cpu_has_interrupt(struct kvm_vcpu *v) -{ - return !!(v->arch.pending_exceptions); -} - -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - /* do real check here */ - return 1; -} - int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !(v->arch.msr & MSR_WE); + return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions); } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 4d613415c435..2c2f98353415 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -283,7 +283,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) return 1; } -int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) +static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; @@ -320,12 +320,6 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) return rc; } -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - /* do real check here */ - return 1; -} - int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return 0; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3f4f00a23536..08732d7b6d98 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -797,6 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void); int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 11cfd897aac6..b87d65d89a05 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4890,8 +4890,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED - || vcpu->arch.nmi_pending; + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || vcpu->arch.nmi_pending || + (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)); } void kvm_vcpu_kick(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c6a5f008746..4af56036a6bf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -332,7 +332,6 @@ int kvm_arch_hardware_setup(void); void kvm_arch_hardware_unsetup(void); void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); @@ -341,7 +340,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); -int kvm_cpu_has_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d7b9bbba26da..532af9b41ee3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1666,9 +1666,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if ((kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)) || - kvm_arch_vcpu_runnable(vcpu)) { + if (kvm_arch_vcpu_runnable(vcpu)) { set_bit(KVM_REQ_UNHALT, &vcpu->requests); break; } -- cgit v1.2.3-70-g09d2 From 4088bb3caee82086fd85a844604274f6237f66a7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 8 Jul 2009 11:26:54 +0300 Subject: KVM: silence lapic kernel messages that can be triggered by a guest Some Linux versions (f8) try to read EOI register that is write only. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5d697602048b..66122bfc74a9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -597,14 +597,14 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, static const u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { - printk(KERN_ERR "KVM_APIC_READ: alignment error %x %d\n", - offset, len); + apic_debug("KVM_APIC_READ: alignment error %x %d\n", + offset, len); return 1; } if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { - printk(KERN_ERR "KVM_APIC_READ: read reserved register %x\n", - offset); + apic_debug("KVM_APIC_READ: read reserved register %x\n", + offset); return 1; } -- cgit v1.2.3-70-g09d2 From 3662cb1cd6ed26873ca808f3e16cc54246ad40ca Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 9 Jul 2009 17:00:42 +0800 Subject: KVM: Discard unnecessary kvm_mmu_flush_tlb() in kvm_mmu_load() set_cr3() should already cover the TLB flushing. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f1f08159e11e..87c67f449279 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2373,8 +2373,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); if (r) goto out; + /* set_cr3() should ensure TLB has been flushed */ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); - kvm_mmu_flush_tlb(vcpu); out: return r; } -- cgit v1.2.3-70-g09d2 From 6a1ac77110ee3e8d8dfdef8442f3b30b3d83e6a2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 15 Jul 2009 15:34:41 -0300 Subject: KVM: MMU: fix missing locking in alloc_mmu_pages n_requested_mmu_pages/n_free_mmu_pages are used by kvm_mmu_change_mmu_pages to calculate the number of pages to zap. alloc_mmu_pages, called from the vcpu initialization path, modifies this variables without proper locking, which can result in a negative value in kvm_mmu_change_mmu_pages (say, with cpu hotplug). Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 87c67f449279..86c2551fe136 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2728,12 +2728,14 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) ASSERT(vcpu); + spin_lock(&vcpu->kvm->mmu_lock); if (vcpu->kvm->arch.n_requested_mmu_pages) vcpu->kvm->arch.n_free_mmu_pages = vcpu->kvm->arch.n_requested_mmu_pages; else vcpu->kvm->arch.n_free_mmu_pages = vcpu->kvm->arch.n_alloc_mmu_pages; + spin_unlock(&vcpu->kvm->mmu_lock); /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first -- cgit v1.2.3-70-g09d2 From decde80b6a2fedd6ea62ea5c4bc4b3cb9fabf153 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 12 Jul 2009 17:13:29 +0300 Subject: KVM: s390: remove unused structs They are not used by common code without defines which s390 does not have. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/s390/include/asm/kvm.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 0b2f829f6d50..3dfcaeb5d7f4 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -15,15 +15,6 @@ */ #include -/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ -struct kvm_pic_state { - /* no PIC for s390 */ -}; - -struct kvm_ioapic_state { - /* no IOAPIC for s390 */ -}; - /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ -- cgit v1.2.3-70-g09d2 From 46a359e71526909a18a47aaf4347343d6d1d74b2 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 18 Jul 2009 23:58:32 +0900 Subject: KVM: x86: use get_desc_base() and get_desc_limit() Use get_desc_base() and get_desc_limit() to get the base address and limit in desc_struct. Cc: Avi Kivity Cc: kvm@vger.kernel.org Signed-off-by: Akinobu Mita Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b87d65d89a05..6277a316821e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -142,8 +142,7 @@ unsigned long segment_base(u16 selector) table_base = segment_base(ldt_selector); } d = (struct desc_struct *)(table_base + (selector & ~7)); - v = d->base0 | ((unsigned long)d->base1 << 16) | - ((unsigned long)d->base2 << 24); + v = get_desc_base(d); #ifdef CONFIG_X86_64 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; @@ -3947,11 +3946,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu, static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, struct kvm_segment *kvm_desct) { - kvm_desct->base = seg_desc->base0; - kvm_desct->base |= seg_desc->base1 << 16; - kvm_desct->base |= seg_desc->base2 << 24; - kvm_desct->limit = seg_desc->limit0; - kvm_desct->limit |= seg_desc->limit << 16; + kvm_desct->base = get_desc_base(seg_desc); + kvm_desct->limit = get_desc_limit(seg_desc); if (seg_desc->g) { kvm_desct->limit <<= 12; kvm_desct->limit |= 0xfff; @@ -4030,11 +4026,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { - u32 base_addr; - - base_addr = seg_desc->base0; - base_addr |= (seg_desc->base1 << 16); - base_addr |= (seg_desc->base2 << 24); + u32 base_addr = get_desc_base(seg_desc); return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); } @@ -4323,7 +4315,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } } - if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { + if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); return 1; } -- cgit v1.2.3-70-g09d2 From b792c344dfd57ee2cf737440e4a9b4a5bc39d1db Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 19 Jul 2009 00:00:01 +0900 Subject: KVM: x86: use kvm_get_gdt() and kvm_read_ldt() Use kvm_get_gdt() and kvm_read_ldt() to reduce inline assembly code. Cc: Avi Kivity Cc: kvm@vger.kernel.org Signed-off-by: Akinobu Mita Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 6 +++--- arch/x86/kvm/x86.c | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8728e514c851..92fc0dab505d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -290,7 +290,7 @@ static void svm_hardware_enable(void *garbage) struct svm_cpu_data *svm_data; uint64_t efer; - struct desc_ptr gdt_descr; + struct descriptor_table gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -310,8 +310,8 @@ static void svm_hardware_enable(void *garbage) svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; svm_data->next_asid = svm_data->max_asid + 1; - asm volatile ("sgdt %0" : "=m"(gdt_descr)); - gdt = (struct desc_struct *)gdt_descr.address; + kvm_get_gdt(&gdt_descr); + gdt = (struct desc_struct *)gdt_descr.base; svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); rdmsrl(MSR_EFER, efer); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6277a316821e..c7ec0c921c01 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -132,13 +132,12 @@ unsigned long segment_base(u16 selector) if (selector == 0) return 0; - asm("sgdt %0" : "=m"(gdt)); + kvm_get_gdt(&gdt); table_base = gdt.base; if (selector & 4) { /* from ldt */ - u16 ldt_selector; + u16 ldt_selector = kvm_read_ldt(); - asm("sldt %0" : "=g"(ldt_selector)); table_base = segment_base(ldt_selector); } d = (struct desc_struct *)(table_base + (selector & ~7)); -- cgit v1.2.3-70-g09d2 From b927a3cec081a605142f5b7e90b730611bee28b1 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 21 Jul 2009 10:42:48 +0800 Subject: KVM: VMX: Introduce KVM_SET_IDENTITY_MAP_ADDR ioctl Now KVM allow guest to modify guest's physical address of EPT's identity mapping page. (change from v1, discard unnecessary check, change ioctl to accept parameter address rather than value) Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 15 ++++++++++----- arch/x86/kvm/x86.c | 19 +++++++++++++++++++ include/linux/kvm.h | 2 ++ 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 08732d7b6d98..e210b218df44 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -411,6 +411,7 @@ struct kvm_arch{ struct page *ept_identity_pagetable; bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c6256b98f078..686e1abb6816 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1719,7 +1719,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : - VMX_EPT_IDENTITY_PAGETABLE_ADDR; + vcpu->kvm->arch.ept_identity_map_addr; } vmx_flush_tlb(vcpu); @@ -2122,7 +2122,7 @@ static int init_rmode_identity_map(struct kvm *kvm) if (likely(kvm->arch.ept_identity_pagetable_done)) return 1; ret = 0; - identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2191,14 +2191,15 @@ static int alloc_identity_pagetable(struct kvm *kvm) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + kvm_userspace_mem.guest_phys_addr = + kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); if (r) goto out; kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, - VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); + kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: up_write(&kvm->slots_lock); return r; @@ -3814,9 +3815,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; - if (enable_ept) + if (enable_ept) { + if (!kvm->arch.ept_identity_map_addr) + kvm->arch.ept_identity_map_addr = + VMX_EPT_IDENTITY_PAGETABLE_ADDR; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + } return &vmx->vcpu; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c7ec0c921c01..f4cb1baaa04b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1206,6 +1206,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOEVENTFD: case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1906,6 +1907,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) return ret; } +static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, + u64 ident_addr) +{ + kvm->arch.ept_identity_map_addr = ident_addr; + return 0; +} + static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, u32 kvm_nr_mmu_pages) { @@ -2173,6 +2181,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r < 0) goto out; break; + case KVM_SET_IDENTITY_MAP_ADDR: { + u64 ident_addr; + + r = -EFAULT; + if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) + goto out; + r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); + if (r < 0) + goto out; + break; + } case KVM_SET_MEMORY_REGION: { struct kvm_memory_region kvm_mem; struct kvm_userspace_memory_region kvm_userspace_mem; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 230a91aa61c9..f8f8900fc5ec 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -435,6 +435,7 @@ struct kvm_ioeventfd { #define KVM_CAP_PIT_STATE2 35 #endif #define KVM_CAP_IOEVENTFD 36 +#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37 #ifdef KVM_CAP_IRQ_ROUTING @@ -512,6 +513,7 @@ struct kvm_irqfd { #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) +#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) -- cgit v1.2.3-70-g09d2 From 84fde248fe42f130cdda39faaa8bb1224c6a13ff Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 16 Jul 2009 17:03:30 +0300 Subject: KVM: PIT: Unregister ack notifier callback when freeing Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 137e54817102..472653c15a2a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -672,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm) if (kvm->arch.vpit) { kvm_unregister_irq_mask_notifier(kvm, 0, &kvm->arch.vpit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, + &kvm->arch.vpit->pit_state.irq_ack_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); -- cgit v1.2.3-70-g09d2 From e601e3be7a5324fcfd34fdb8796688cfe48bd0fe Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 20 Jul 2009 11:30:12 +0200 Subject: KVM: Drop obsolete cpu_get/put in make_all_cpus_request spin_lock disables preemption, so we can simply read the current cpu. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 532af9b41ee3..646cf2ae32fd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -741,8 +741,8 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) cpumask_clear(cpus); - me = get_cpu(); spin_lock(&kvm->requests_lock); + me = smp_processor_id(); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(req, &vcpu->requests)) continue; @@ -757,7 +757,6 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) else called = false; spin_unlock(&kvm->requests_lock); - put_cpu(); free_cpumask_var(cpus); return called; } -- cgit v1.2.3-70-g09d2 From 7f582ab6d8116ce8db5792c219a278519deae6ad Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 22 Jul 2009 23:53:01 +0200 Subject: KVM: VMX: Avoid to return ENOTSUPP to userland Choose some allowed error values for the cases VMX returned ENOTSUPP so far as these values could be returned by the KVM_RUN IOCTL. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 686e1abb6816..c5aaa1b5fdbe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3133,7 +3133,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) printk(KERN_ERR "Fail to handle apic access vmexit! Offset is 0x%lx\n", offset); - return -ENOTSUPP; + return -ENOEXEC; } return 1; } @@ -3202,7 +3202,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (exit_qualification & (1 << 6)) { printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -ENOTSUPP; + return -EINVAL; } gla_validity = (exit_qualification >> 7) & 0x3; -- cgit v1.2.3-70-g09d2 From 5f0269f5d72f622514daec9af158c32e933800b6 Mon Sep 17 00:00:00 2001 From: Mikhail Ershov Date: Mon, 3 Aug 2009 14:58:25 +0300 Subject: KVM: Align cr8 threshold when userspace changes cr8 Commit f0a3602c20 ("KVM: Move interrupt injection logic to x86.c") does not update the cr8 intercept if the lapic is disabled, so when userspace updates cr8, the cr8 threshold control is not updated and we are left with illegal control fields. Fix by explicitly resetting the cr8 threshold. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4cb1baaa04b..69de7248083f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4445,6 +4445,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + update_cr8_intercept(vcpu); + /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && -- cgit v1.2.3-70-g09d2 From 1444885a045fe3b1905a14ea1b52540bf556578b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 27 Jul 2009 23:41:01 -0300 Subject: KVM: limit lapic periodic timer frequency Otherwise its possible to starve the host by programming lapic timer with a very high frequency. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 66122bfc74a9..5b9d1ae09cad 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -670,6 +670,15 @@ static void start_apic_timer(struct kvm_lapic *apic) if (!apic->lapic_timer.period) return; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + if (apic->lapic_timer.period < NSEC_PER_MSEC/2) + apic->lapic_timer.period = NSEC_PER_MSEC/2; + } hrtimer_start(&apic->lapic_timer.timer, ktime_add_ns(now, apic->lapic_timer.period), -- cgit v1.2.3-70-g09d2 From aed665f7bbacf09121370bc7e21b4cc7d55fc5ef Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 3 Aug 2009 17:10:06 +0800 Subject: KVM: fix kvm_init() error handling Remove debugfs file if kvm_arch_init() return error Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 646cf2ae32fd..4470251ddc53 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2791,8 +2791,8 @@ out_free_0: __free_page(bad_page); out: kvm_arch_exit(); - kvm_exit_debug(); out_fail: + kvm_exit_debug(); return r; } EXPORT_SYMBOL_GPL(kvm_init); -- cgit v1.2.3-70-g09d2 From 44ad9944f151390363fc6edaba466de8dfef050f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:42 +0200 Subject: KVM: MMU: make rmap code aware of mapping levels This patch removes the largepage parameter from the rmap_add function. Together with rmap_remove this function now uses the role.level field to find determine if the page is a huge page. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 53 ++++++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 86c2551fe136..b93ad2c79c15 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -479,19 +479,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) * Note: gfn must be unaliased before this function get called */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; unsigned long idx; slot = gfn_to_memslot(kvm, gfn); - if (!lpage) + if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)); + idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); - return &slot->lpage_info[0][idx].rmap_pde; + return &slot->lpage_info[level - 2][idx].rmap_pde; } /* @@ -507,7 +507,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) * the spte was not added. * */ -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; @@ -519,7 +519,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); sp->gfns[spte - sp->spt] = gfn; - rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); *rmapp = (unsigned long)spte; @@ -589,7 +589,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); + rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); @@ -652,10 +652,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; u64 *spte; - int write_protected = 0; + int i, write_protected = 0; gfn = unalias_gfn(kvm, gfn); - rmapp = gfn_to_rmap(kvm, gfn, 0); + rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -677,21 +677,24 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) } /* check for huge page mappings */ - rmapp = gfn_to_rmap(kvm, gfn, 1); - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - BUG_ON(!spte); - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); - pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { - rmap_remove(kvm, spte); - --kvm->stat.lpages; - __set_spte(spte, shadow_trap_nonpresent_pte); - spte = NULL; - write_protected = 1; + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + rmapp = gfn_to_rmap(kvm, gfn, i); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!spte); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); + pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); + if (is_writeble_pte(*spte)) { + rmap_remove(kvm, spte); + --kvm->stat.lpages; + __set_spte(spte, shadow_trap_nonpresent_pte); + spte = NULL; + write_protected = 1; + } + spte = rmap_next(kvm, rmapp, spte); } - spte = rmap_next(kvm, rmapp, spte); } return write_protected; @@ -1815,7 +1818,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { - rmap_count = rmap_add(vcpu, sptep, gfn, largepage); + rmap_count = rmap_add(vcpu, sptep, gfn); if (!is_rmap_spte(*sptep)) kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) -- cgit v1.2.3-70-g09d2 From d25797b24c0ff2efc2b2fabaebb0ec0cafc0d3e3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:43 +0200 Subject: KVM: MMU: rename is_largepage_backed to mapping_level With the new name and the corresponding backend changes this function can now support multiple hugepage sizes. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 100 ++++++++++++++++++++++++++++++--------------- arch/x86/kvm/paging_tmpl.h | 4 +- 2 files changed, 69 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b93ad2c79c15..c707936b2414 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -393,37 +393,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) * Return the pointer to the largepage write count for a given * gfn, handling slots that are not large page aligned. */ -static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) +static int *slot_largepage_idx(gfn_t gfn, + struct kvm_memory_slot *slot, + int level) { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)); - return &slot->lpage_info[0][idx].write_count; + idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + return &slot->lpage_info[level - 2][idx].write_count; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) { + struct kvm_memory_slot *slot; int *write_count; + int i; gfn = unalias_gfn(kvm, gfn); - write_count = slot_largepage_idx(gfn, - gfn_to_memslot_unaliased(kvm, gfn)); - *write_count += 1; + + slot = gfn_to_memslot_unaliased(kvm, gfn); + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + write_count = slot_largepage_idx(gfn, slot, i); + *write_count += 1; + } } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { + struct kvm_memory_slot *slot; int *write_count; + int i; gfn = unalias_gfn(kvm, gfn); - write_count = slot_largepage_idx(gfn, - gfn_to_memslot_unaliased(kvm, gfn)); - *write_count -= 1; - WARN_ON(*write_count < 0); + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + slot = gfn_to_memslot_unaliased(kvm, gfn); + write_count = slot_largepage_idx(gfn, slot, i); + *write_count -= 1; + WARN_ON(*write_count < 0); + } } -static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) +static int has_wrprotected_page(struct kvm *kvm, + gfn_t gfn, + int level) { struct kvm_memory_slot *slot; int *largepage_idx; @@ -431,47 +446,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) gfn = unalias_gfn(kvm, gfn); slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { - largepage_idx = slot_largepage_idx(gfn, slot); + largepage_idx = slot_largepage_idx(gfn, slot, level); return *largepage_idx; } return 1; } -static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) +static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { + unsigned long page_size = PAGE_SIZE; struct vm_area_struct *vma; unsigned long addr; - int ret = 0; + int i, ret = 0; addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return ret; + return page_size; down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); - if (vma && is_vm_hugetlb_page(vma)) - ret = 1; + if (!vma) + goto out; + + page_size = vma_kernel_pagesize(vma); + +out: up_read(¤t->mm->mmap_sem); + for (i = PT_PAGE_TABLE_LEVEL; + i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { + if (page_size >= KVM_HPAGE_SIZE(i)) + ret = i; + else + break; + } + return ret; } -static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - - if (has_wrprotected_page(vcpu->kvm, large_gfn)) - return 0; - - if (!host_largepage_backed(vcpu->kvm, large_gfn)) - return 0; + int host_level; + int level = PT_PAGE_TABLE_LEVEL; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) - return 0; + return PT_PAGE_TABLE_LEVEL; - return 1; + host_level = host_mapping_level(vcpu->kvm, large_gfn); + + if (host_level == PT_PAGE_TABLE_LEVEL) + return host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { + + if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + break; + } + + return level - 1; } /* @@ -1733,7 +1768,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { + if (largepage && has_wrprotected_page(vcpu->kvm, gfn, 1)) { ret = 1; spte = shadow_trap_nonpresent_pte; goto set_pte; @@ -1884,8 +1919,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn_t pfn; unsigned long mmu_seq; - if (is_largepage_backed(vcpu, gfn & - ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) { + if (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL) { gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); largepage = 1; } @@ -2091,8 +2125,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - if (is_largepage_backed(vcpu, gfn & - ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) { + if (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL) { gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); largepage = 1; } @@ -2494,7 +2527,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { + if (is_large_pte(gpte) && + (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL)) { gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); vcpu->arch.update_pte.largepage = 1; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 36ac6d70a847..44f0346578cb 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -407,8 +407,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (walker.level == PT_DIRECTORY_LEVEL) { gfn_t large_gfn; large_gfn = walker.gfn & - ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - if (is_largepage_backed(vcpu, large_gfn)) { + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); + if (mapping_level(vcpu, large_gfn) == PT_DIRECTORY_LEVEL) { walker.gfn = large_gfn; largepage = 1; } -- cgit v1.2.3-70-g09d2 From 852e3c19ac64b7c3912e8efe42d3ce090ebc0161 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:44 +0200 Subject: KVM: MMU: make direct mapping paths aware of mapping levels Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 83 ++++++++++++++++++++++++----------------- arch/x86/kvm/paging_tmpl.h | 6 +-- 3 files changed, 53 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e210b218df44..e09dc26d96bd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -315,7 +315,7 @@ struct kvm_vcpu_arch { struct { gfn_t gfn; /* presumed gfn during guest pte update */ pfn_t pfn; /* pfn corresponding to that gfn */ - int largepage; + int level; unsigned long mmu_seq; } update_pte; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c707936b2414..110c224ed1fb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -257,7 +257,7 @@ static int is_last_spte(u64 pte, int level) { if (level == PT_PAGE_TABLE_LEVEL) return 1; - if (level == PT_DIRECTORY_LEVEL && is_large_pte(pte)) + if (is_large_pte(pte)) return 1; return 0; } @@ -753,7 +753,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int (*handler)(struct kvm *kvm, unsigned long *rmapp)) { - int i; + int i, j; int retval = 0; /* @@ -772,11 +772,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - int idx = gfn_offset / - KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL); + retval |= handler(kvm, &memslot->rmap[gfn_offset]); - retval |= handler(kvm, - &memslot->lpage_info[0][idx].rmap_pde); + + for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { + int idx = gfn_offset; + idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + retval |= handler(kvm, + &memslot->lpage_info[j][idx].rmap_pde); + } } } @@ -814,12 +818,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) +static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { unsigned long *rmapp; + struct kvm_mmu_page *sp; + + sp = page_header(__pa(spte)); gfn = unalias_gfn(vcpu->kvm, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp); kvm_flush_remote_tlbs(vcpu->kvm); @@ -1734,7 +1741,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, - int write_fault, int dirty, int largepage, + int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { @@ -1757,7 +1764,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_nx_mask; if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; - if (largepage) + if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, @@ -1768,7 +1775,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - if (largepage && has_wrprotected_page(vcpu->kvm, gfn, 1)) { + if (level > PT_PAGE_TABLE_LEVEL && + has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; spte = shadow_trap_nonpresent_pte; goto set_pte; @@ -1806,7 +1814,7 @@ set_pte: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, + int *ptwrite, int level, gfn_t gfn, pfn_t pfn, bool speculative) { int was_rmapped = 0; @@ -1823,7 +1831,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. */ - if (largepage && !is_large_pte(*sptep)) { + if (level > PT_PAGE_TABLE_LEVEL && + !is_large_pte(*sptep)) { struct kvm_mmu_page *child; u64 pte = *sptep; @@ -1836,8 +1845,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } else was_rmapped = 1; } + if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative, true)) { + dirty, level, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1857,7 +1867,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!is_rmap_spte(*sptep)) kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, gfn, largepage); + rmap_recycle(vcpu, sptep, gfn); } else { if (was_writeble) kvm_release_pfn_dirty(pfn); @@ -1875,7 +1885,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) + int level, gfn_t gfn, pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -1883,11 +1893,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, gfn_t pseudo_gfn; for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == PT_PAGE_TABLE_LEVEL - || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { + if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, - largepage, gfn, pfn, false); + level, gfn, pfn, false); ++vcpu->stat.pf_fixed; break; } @@ -1915,14 +1924,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; - int largepage = 0; + int level; pfn_t pfn; unsigned long mmu_seq; - if (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL) { - gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - largepage = 1; - } + level = mapping_level(vcpu, gfn); + + /* + * This path builds a PAE pagetable - so we can map 2mb pages at + * maximum. Therefore check if the level is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -1938,7 +1953,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, largepage, gfn, pfn); + r = __direct_map(vcpu, v, write, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2114,7 +2129,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, { pfn_t pfn; int r; - int largepage = 0; + int level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; @@ -2125,10 +2140,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - if (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL) { - gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - largepage = 1; - } + level = mapping_level(vcpu, gfn); + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); @@ -2141,7 +2156,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn); + level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -2448,7 +2463,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - if (!vcpu->arch.update_pte.largepage || + if (vcpu->arch.update_pte.level == PT_PAGE_TABLE_LEVEL || sp->role.glevels == PT32_ROOT_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; return; @@ -2498,7 +2513,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 gpte = 0; pfn_t pfn; - vcpu->arch.update_pte.largepage = 0; + vcpu->arch.update_pte.level = PT_PAGE_TABLE_LEVEL; if (bytes != 4 && bytes != 8) return; @@ -2530,7 +2545,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (is_large_pte(gpte) && (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL)) { gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - vcpu->arch.update_pte.largepage = 1; + vcpu->arch.update_pte.level = PT_DIRECTORY_LEVEL; } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 44f0346578cb..b167f0d57b54 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -253,7 +253,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pt_element_t gpte; unsigned pte_access; pfn_t pfn; - int largepage = vcpu->arch.update_pte.largepage; + int level = vcpu->arch.update_pte.level; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { @@ -272,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, largepage, + gpte & PT_DIRTY_MASK, NULL, level, gpte_to_gfn(gpte), pfn, true); } @@ -306,7 +306,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, gw->pte_access & access, user_fault, write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, + ptwrite, level, gw->gfn, pfn, false); break; } -- cgit v1.2.3-70-g09d2 From e04da980c35d75fa050ba4009ad99025432d8d7d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:45 +0200 Subject: KVM: MMU: make page walker aware of mapping levels The page walker may be used with nested paging too when accessing mmio areas. Make it support the additional page-level too. [ Marcelo: fix reserved bit check for 1gb pte ] Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 17 ++++++++++++++- arch/x86/kvm/paging_tmpl.h | 52 +++++++++++++++++++++++++--------------------- 2 files changed, 44 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 110c224ed1fb..09ab6433bf1d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -108,6 +108,9 @@ module_param(oos_shadow, bool, 0644); #define PT32_LEVEL_MASK(level) \ (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) +#define PT32_LVL_OFFSET_MASK(level) \ + (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT32_LEVEL_BITS))) - 1)) #define PT32_INDEX(address, level)\ (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) @@ -116,10 +119,19 @@ module_param(oos_shadow, bool, 0644); #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #define PT64_DIR_BASE_ADDR_MASK \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) +#define PT64_LVL_ADDR_MASK(level) \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) #define PT32_BASE_ADDR_MASK PAGE_MASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) +#define PT32_LVL_ADDR_MASK(level) \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT32_LEVEL_BITS))) - 1)) #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) @@ -130,6 +142,7 @@ module_param(oos_shadow, bool, 0644); #define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) +#define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 @@ -2273,7 +2286,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; + context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 29); context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b167f0d57b54..578276e34bd9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -27,7 +27,8 @@ #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS @@ -43,7 +44,8 @@ #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS @@ -53,8 +55,8 @@ #error Invalid PTTYPE value #endif -#define gpte_to_gfn FNAME(gpte_to_gfn) -#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) +#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) +#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) /* * The guest_walker structure emulates the behavior of the hardware page @@ -71,14 +73,9 @@ struct guest_walker { u32 error_code; }; -static gfn_t gpte_to_gfn(pt_element_t gpte) +static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { - return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static gfn_t gpte_to_gfn_pde(pt_element_t gpte) -{ - return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, @@ -189,18 +186,24 @@ walk: walker->ptes[walker->level - 1] = pte; - if (walker->level == PT_PAGE_TABLE_LEVEL) { - walker->gfn = gpte_to_gfn(pte); - break; - } - - if (walker->level == PT_DIRECTORY_LEVEL - && (pte & PT_PAGE_SIZE_MASK) - && (PTTYPE == 64 || is_pse(vcpu))) { - walker->gfn = gpte_to_gfn_pde(pte); - walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); - if (PTTYPE == 32 && is_cpuid_PSE36()) + if ((walker->level == PT_PAGE_TABLE_LEVEL) || + ((walker->level == PT_DIRECTORY_LEVEL) && + (pte & PT_PAGE_SIZE_MASK) && + (PTTYPE == 64 || is_pse(vcpu))) || + ((walker->level == PT_PDPE_LEVEL) && + (pte & PT_PAGE_SIZE_MASK) && + is_long_mode(vcpu))) { + int lvl = walker->level; + + walker->gfn = gpte_to_gfn_lvl(pte, lvl); + walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) + >> PAGE_SHIFT; + + if (PTTYPE == 32 && + walker->level == PT_DIRECTORY_LEVEL && + is_cpuid_PSE36()) walker->gfn += pse36_gfn_delta(pte); + break; } @@ -609,9 +612,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_BASE_ADDR_MASK #undef PT_INDEX #undef PT_LEVEL_MASK -#undef PT_DIR_BASE_ADDR_MASK +#undef PT_LVL_ADDR_MASK +#undef PT_LVL_OFFSET_MASK #undef PT_LEVEL_BITS #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn -#undef gpte_to_gfn_pde +#undef gpte_to_gfn_lvl #undef CMPXCHG -- cgit v1.2.3-70-g09d2 From 7e4e4056f72da51c5dede48515df0ecd20eaf8ca Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:46 +0200 Subject: KVM: MMU: shadow support for 1gb pages This patch adds support for shadow paging to the 1gb page table code in KVM. With this code the guest can use 1gb pages even if the host does not support them. [ Marcelo: fix shadow page collision on pmd level if a guest 1gb page is mapped with 4kb ptes on host level ] Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 14 ++------------ arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++---------------------- 3 files changed, 22 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e09dc26d96bd..c9fb2bc13a81 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -315,7 +315,6 @@ struct kvm_vcpu_arch { struct { gfn_t gfn; /* presumed gfn during guest pte update */ pfn_t pfn; /* pfn corresponding to that gfn */ - int level; unsigned long mmu_seq; } update_pte; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 09ab6433bf1d..1249c12e1d5c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2478,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - if (vcpu->arch.update_pte.level == PT_PAGE_TABLE_LEVEL || - sp->role.glevels == PT32_ROOT_LEVEL) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } + ++vcpu->kvm->stat.mmu_pde_zapped; + return; } ++vcpu->kvm->stat.mmu_pte_updated; @@ -2528,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 gpte = 0; pfn_t pfn; - vcpu->arch.update_pte.level = PT_PAGE_TABLE_LEVEL; - if (bytes != 4 && bytes != 8) return; @@ -2557,11 +2552,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - if (is_large_pte(gpte) && - (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL)) { - gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - vcpu->arch.update_pte.level = PT_DIRECTORY_LEVEL; - } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 578276e34bd9..d2fec9c12d22 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -256,7 +256,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pt_element_t gpte; unsigned pte_access; pfn_t pfn; - int level = vcpu->arch.update_pte.level; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { @@ -275,7 +274,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, level, + gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true); } @@ -284,7 +283,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, - int user_fault, int write_fault, int largepage, + int user_fault, int write_fault, int hlevel, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; @@ -303,8 +302,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, for_each_shadow_entry(vcpu, addr, iterator) { level = iterator.level; sptep = iterator.sptep; - if (level == PT_PAGE_TABLE_LEVEL - || (largepage && level == PT_DIRECTORY_LEVEL)) { + if (iterator.level == hlevel) { mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, @@ -323,12 +321,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, kvm_flush_remote_tlbs(vcpu->kvm); } - if (level == PT_DIRECTORY_LEVEL - && gw->level == PT_DIRECTORY_LEVEL) { + if (level <= gw->level) { + int delta = level - gw->level + 1; direct = 1; - if (!is_dirty_gpte(gw->ptes[level - 1])) + if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + table_gfn = gpte_to_gfn(gw->ptes[level - delta]); + /* advance table_gfn when emulating 1gb pages with 4k */ + if (delta == 0) + table_gfn += PT_INDEX(addr, level); } else { direct = 0; table_gfn = gw->table_gfn[level - 2]; @@ -381,7 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int write_pt = 0; int r; pfn_t pfn; - int largepage = 0; + int level = PT_PAGE_TABLE_LEVEL; unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -407,15 +408,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - if (walker.level == PT_DIRECTORY_LEVEL) { - gfn_t large_gfn; - large_gfn = walker.gfn & - ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - if (mapping_level(vcpu, large_gfn) == PT_DIRECTORY_LEVEL) { - walker.gfn = large_gfn; - largepage = 1; - } + if (walker.level >= PT_DIRECTORY_LEVEL) { + level = min(walker.level, mapping_level(vcpu, walker.gfn)); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); @@ -432,8 +429,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, goto out_unlock; kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - largepage, &write_pt, pfn); - + level, &write_pt, pfn); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, sptep, *sptep, write_pt); @@ -468,8 +464,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) sptep = iterator.sptep; /* FIXME: properly handle invlpg on large guest pages */ - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || + ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { struct kvm_mmu_page *sp = page_header(__pa(sptep)); pte_gpa = (sp->gfn << PAGE_SHIFT); @@ -599,7 +596,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_gpte(gpte), 0, gfn, + is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, spte_to_pfn(sp->spt[i]), true, false); } -- cgit v1.2.3-70-g09d2 From 04326caacff2b162d359c15a2edf634448897d1a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:47 +0200 Subject: KVM: MMU: enable gbpages by increasing nr of pagesizes Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c9fb2bc13a81..3315efaacf9e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -55,7 +55,7 @@ #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ -#define KVM_NR_PAGE_SIZES 2 +#define KVM_NR_PAGE_SIZES 3 #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) -- cgit v1.2.3-70-g09d2 From 344f414fa0f16254dd07195d4cd11b2f92931d3d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:48 +0200 Subject: KVM: report 1GB page support to userspace If userspace knows that the kernel part supports 1GB pages it can enable the corresponding cpuid bit so that guests actually use GB pages. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 3 ++- 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3315efaacf9e..b17d845897b7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -528,6 +528,8 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + bool (*gb_page_enable)(void); + const struct trace_print_flags *exit_reasons_str; }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 92fc0dab505d..10e718db990b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2754,6 +2754,11 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; +static bool svm_gb_page_enable(void) +{ + return true; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2817,6 +2822,7 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, + .gb_page_enable = svm_gb_page_enable, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c5aaa1b5fdbe..32e6d2031ba2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3908,6 +3908,11 @@ static const struct trace_print_flags vmx_exit_reasons_str[] = { { -1, NULL } }; +static bool vmx_gb_page_enable(void) +{ + return false; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3969,6 +3974,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .exit_reasons_str = vmx_exit_reasons_str, + .gb_page_enable = vmx_gb_page_enable, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 69de7248083f..fa525d511d92 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1444,6 +1444,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; + unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 unsigned f_lm = F(LM); #else @@ -1468,7 +1469,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = -- cgit v1.2.3-70-g09d2 From 95fb4eb6981216c07ac01f598e61b273b6eff58c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 29 Jul 2009 00:46:38 +0200 Subject: KVM: remove superfluous NULL pointer check in kvm_inject_pit_timer_irqs() This takes care of the following entries from Dan's list: arch/x86/kvm/i8254.c +714 kvm_inject_pit_timer_irqs(6) warning: variable derefenced in initializer 'vcpu' arch/x86/kvm/i8254.c +714 kvm_inject_pit_timer_irqs(6) warning: variable derefenced before check 'vcpu' Reported-by: Dan Carpenter Cc: corbet@lwn.net Cc: eteo@redhat.com Cc: Julia Lawall Signed-off-by: Bartlomiej Zolnierkiewicz Acked-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 472653c15a2a..82ad523b4901 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -713,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; struct kvm_kpit_state *ps; - if (vcpu && pit) { + if (pit) { int inject = 0; ps = &pit->pit_state; -- cgit v1.2.3-70-g09d2 From 3b80fffe2b31fb716d3ebe729c54464ee7856723 Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Tue, 28 Jul 2009 15:26:58 -0300 Subject: KVM: MMU: make __kvm_mmu_free_some_pages handle empty list First check if the list is empty before attempting to look at list entries. Cc: stable@kernel.org Signed-off-by: Izik Eidus Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1249c12e1d5c..28be35c6ff1d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2705,7 +2705,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { + while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && + !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, -- cgit v1.2.3-70-g09d2 From b90c062c65cc8839edfac39778a37a55ca9bda36 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 28 Jul 2009 15:26:59 -0300 Subject: KVM: MMU: fix bogus alloc_mmu_pages assignment Remove the bogus n_free_mmu_pages assignment from alloc_mmu_pages. It breaks accounting of mmu pages, since n_free_mmu_pages is modified but the real number of pages remains the same. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 28be35c6ff1d..6f38178af926 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2786,14 +2786,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) ASSERT(vcpu); - spin_lock(&vcpu->kvm->mmu_lock); - if (vcpu->kvm->arch.n_requested_mmu_pages) - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_requested_mmu_pages; - else - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_alloc_mmu_pages; - spin_unlock(&vcpu->kvm->mmu_lock); /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first -- cgit v1.2.3-70-g09d2 From 07708c4af1346ab1521b26a202f438366b7bcffd Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 3 Aug 2009 18:43:28 +0200 Subject: KVM: x86: Disallow hypercalls for guest callers in rings > 0 So far unprivileged guest callers running in ring 3 can issue, e.g., MMU hypercalls. Normally, such callers cannot provide any hand-crafted MMU command structure as it has to be passed by its physical address, but they can still crash the guest kernel by passing random addresses. To close the hole, this patch considers hypercalls valid only if issued from guest ring 0. This may still be relaxed on a per-hypercall base in the future once required. Cc: stable@kernel.org Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 ++++++ include/linux/kvm_para.h | 1 + 2 files changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa525d511d92..92b5eddaa877 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3213,6 +3213,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } + if (kvm_x86_ops->get_cpl(vcpu) != 0) { + ret = -KVM_EPERM; + goto out; + } + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; @@ -3224,6 +3229,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } +out: kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h index 3ddce03766ca..d73109243fda 100644 --- a/include/linux/kvm_para.h +++ b/include/linux/kvm_para.h @@ -13,6 +13,7 @@ #define KVM_ENOSYS 1000 #define KVM_EFAULT EFAULT #define KVM_E2BIG E2BIG +#define KVM_EPERM EPERM #define KVM_HC_VAPIC_POLL_IRQ 1 #define KVM_HC_MMU_OP 2 -- cgit v1.2.3-70-g09d2 From eab4b8aa34fc64e3a91358e1612e6d059396193b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 4 Aug 2009 15:02:54 +0300 Subject: KVM: VMX: Optimize vmx_get_cpl() Instead of calling vmx_get_segment() (which reads a whole bunch of vmcs fields), read only the cs selector which contains the cpl. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32e6d2031ba2..0ba706e87c50 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1773,16 +1773,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - struct kvm_segment kvm_seg; - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ return 3; - vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); - return kvm_seg.selector & 3; + return vmcs_read16(GUEST_CS_SELECTOR) & 3; } static u32 vmx_segment_access_rights(struct kvm_segment *var) -- cgit v1.2.3-70-g09d2 From 1f3ee616dd21ff155f781c35509229bf2788c072 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Tue, 30 Jun 2009 16:24:28 +0530 Subject: KVM: ignore reads to perfctr msrs We ignore writes to the perfctr msrs. Ignore reads as well. Kaspersky antivirus crashes Windows guests if it can't read these MSRs. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92b5eddaa877..132c5100d4f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1048,9 +1048,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K8_SYSCFG: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: -- cgit v1.2.3-70-g09d2 From 3a34a8810b2ed316bfe58fa53640e8d30de3f6c2 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 4 Aug 2009 02:08:45 -0700 Subject: KVM: fix EFER read buffer overflow Check whether index is within bounds before grabbing the element. Signed-off-by: Roel Kluin Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0ba706e87c50..31c3a8740c42 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -571,12 +571,15 @@ static void reload_tss(void) static void load_transition_efer(struct vcpu_vmx *vmx) { int efer_offset = vmx->msr_offset_efer; - u64 host_efer = vmx->host_msrs[efer_offset].data; - u64 guest_efer = vmx->guest_msrs[efer_offset].data; + u64 host_efer; + u64 guest_efer; u64 ignore_bits; if (efer_offset < 0) return; + host_efer = vmx->host_msrs[efer_offset].data; + guest_efer = vmx->guest_msrs[efer_offset].data; + /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless * outside long mode -- cgit v1.2.3-70-g09d2 From 956f97cf665dc8c8ce2d5138bb38c022673b12d7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 4 Aug 2009 15:30:27 +0300 Subject: KVM: Call kvm_vcpu_kick() inside pic spinlock d5ecfdd25 moved it out because back than it was impossible to call it inside spinlock. This restriction no longer exists. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 10 +--------- arch/x86/kvm/irq.h | 1 - 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index daf4606b0293..d27320c8d464 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -43,11 +43,9 @@ static void pic_unlock(struct kvm_pic *s) { struct kvm *kvm = s->kvm; unsigned acks = s->pending_acks; - bool wakeup = s->wakeup_needed; struct kvm_vcpu *vcpu; s->pending_acks = 0; - s->wakeup_needed = false; spin_unlock(&s->lock); @@ -56,12 +54,6 @@ static void pic_unlock(struct kvm_pic *s) __ffs(acks)); acks &= acks - 1; } - - if (wakeup) { - vcpu = s->kvm->bsp_vcpu; - if (vcpu) - kvm_vcpu_kick(vcpu); - } } static void pic_clear_isr(struct kvm_kpic_state *s, int irq) @@ -527,7 +519,7 @@ static void pic_irq_request(void *opaque, int level) s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - s->wakeup_needed = true; + kvm_vcpu_kick(vcpu); } } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9f593188129e..7d6058a2fd38 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -63,7 +63,6 @@ struct kvm_kpic_state { struct kvm_pic { spinlock_t lock; - bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ -- cgit v1.2.3-70-g09d2 From 938396a2341a4a96eaa5bea93a2bcdd69827c70c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 4 Aug 2009 15:30:28 +0300 Subject: KVM: Call ack notifiers from PIC when guest OS acks an IRQ. Currently they are called when irq vector is been delivered. Calling ack notifiers at this point is wrong. Device assignment ack notifier enables host interrupts, but guest not yet had a chance to clear interrupt condition in a device. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d27320c8d464..3aacd331e2de 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -41,25 +41,16 @@ static void pic_lock(struct kvm_pic *s) static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { - struct kvm *kvm = s->kvm; - unsigned acks = s->pending_acks; - struct kvm_vcpu *vcpu; - - s->pending_acks = 0; - spin_unlock(&s->lock); - - while (acks) { - kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), - __ffs(acks)); - acks &= acks - 1; - } } static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); s->isr_ack |= (1 << irq); + if (s != &s->pics_state->pics[0]) + irq += 8; + kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); } void kvm_pic_clear_isr_ack(struct kvm *kvm) @@ -240,7 +231,6 @@ int kvm_pic_read_irq(struct kvm *kvm) } pic_update_irq(s); pic_unlock(s); - kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq); return intno; } @@ -260,7 +250,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) if (s->irr & (1 << irq) || s->isr & (1 << irq)) { n = irq + irqbase; - s->pics_state->pending_acks |= 1 << n; + kvm_notify_acked_irq(kvm, SELECT_PIC(n), n); } } s->last_irr = 0; -- cgit v1.2.3-70-g09d2 From 88ba63c2653b9b4857148fedba47ba2d8a4652c9 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 4 Aug 2009 15:30:29 +0300 Subject: KVM: Replace pic_lock()/pic_unlock() with direct call to spinlock functions They are not doing anything else now. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 3aacd331e2de..01f151682802 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -32,18 +32,6 @@ #include #include "trace.h" -static void pic_lock(struct kvm_pic *s) - __acquires(&s->lock) -{ - spin_lock(&s->lock); -} - -static void pic_unlock(struct kvm_pic *s) - __releases(&s->lock) -{ - spin_unlock(&s->lock); -} - static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); @@ -56,10 +44,10 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - pic_lock(s); + spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - pic_unlock(s); + spin_unlock(&s->lock); } /* @@ -160,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - pic_lock(s); + spin_lock(&s->lock); pic_update_irq(s); - pic_unlock(s); + spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -170,14 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - pic_lock(s); + spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - pic_unlock(s); + spin_unlock(&s->lock); return ret; } @@ -205,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - pic_lock(s); + spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -230,7 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - pic_unlock(s); + spin_unlock(&s->lock); return intno; } @@ -448,7 +436,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - pic_lock(s); + spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -461,7 +449,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - pic_unlock(s); + spin_unlock(&s->lock); return 0; } @@ -478,7 +466,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - pic_lock(s); + spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -492,7 +480,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - pic_unlock(s); + spin_unlock(&s->lock); return 0; } -- cgit v1.2.3-70-g09d2 From 256cd2ef4f5c125f5df2c81d8457f080a4684ae6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 Aug 2009 10:41:27 +0300 Subject: x86: Export kmap_atomic_to_page() Needed by KVM. Signed-off-by: Avi Kivity --- arch/x86/mm/highmem_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 2112ed55e7ea..572f47c1c73c 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_to_page); void __init set_highmem_pages_init(void) { -- cgit v1.2.3-70-g09d2 From 2af9194d1b683f91ae956afff9afb0b52a241371 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:28 +0200 Subject: KVM: SVM: add helper functions for global interrupt flag This patch makes the code easier to read when it comes to setting, clearing and checking the status of the virtualized global interrupt flag for the VCPU. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 10e718db990b..9f7277273a23 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -129,6 +129,21 @@ static inline bool is_nested(struct vcpu_svm *svm) return svm->nested_vmcb; } +static inline void enable_gif(struct vcpu_svm *svm) +{ + svm->vcpu.arch.hflags |= HF_GIF_MASK; +} + +static inline void disable_gif(struct vcpu_svm *svm) +{ + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; +} + +static inline bool gif_set(struct vcpu_svm *svm) +{ + return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); +} + static unsigned long iopm_base; struct kvm_ldttss_desc { @@ -621,7 +636,9 @@ static void init_vmcb(struct vcpu_svm *svm) force_new_asid(&svm->vcpu); svm->nested_vmcb = 0; - svm->vcpu.arch.hflags = HF_GIF_MASK; + svm->vcpu.arch.hflags = 0; + + enable_gif(svm); } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) @@ -1629,7 +1646,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + disable_gif(svm); /* Exit nested SVM mode */ svm->nested_vmcb = 0; @@ -1761,7 +1778,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; - svm->vcpu.arch.hflags |= HF_GIF_MASK; + enable_gif(svm); return 0; } @@ -1850,7 +1867,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - svm->vcpu.arch.hflags |= HF_GIF_MASK; + enable_gif(svm); return 1; } @@ -1863,7 +1880,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + disable_gif(svm); /* After a CLGI no interrupts should come */ svm_clear_vintr(svm); @@ -2352,7 +2369,7 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - BUG_ON(!(svm->vcpu.arch.hflags & HF_GIF_MASK)); + BUG_ON(!(gif_set(svm))); svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; @@ -2383,7 +2400,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; return (vmcb->save.rflags & X86_EFLAGS_IF) && !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vcpu.arch.hflags & HF_GIF_MASK) && + gif_set(svm) && !is_nested(svm); } @@ -2398,7 +2415,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) * GIF becomes 1, because that's a separate STGI/VMRUN intercept. * The next time we get that intercept, this function will be * called again though and we'll get the vintr intercept. */ - if (svm->vcpu.arch.hflags & HF_GIF_MASK) { + if (gif_set(svm)) { svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } -- cgit v1.2.3-70-g09d2 From 33740e4009b7d287538f68f614eb3542df3597e4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:29 +0200 Subject: KVM: SVM: optimize nested #vmexit It is more efficient to copy only the relevant parts of the vmcb back to the nested vmcb when we emulate an vmexit. Signed-off-by: Joerg Roedel Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 68 ++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9f7277273a23..2f5f2236f2a2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1572,53 +1572,52 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, { struct vmcb *nested_vmcb = (struct vmcb *)arg1; struct vmcb *hsave = svm->hsave; - u64 nested_save[] = { nested_vmcb->save.cr0, - nested_vmcb->save.cr3, - nested_vmcb->save.cr4, - nested_vmcb->save.efer, - nested_vmcb->control.intercept_cr_read, - nested_vmcb->control.intercept_cr_write, - nested_vmcb->control.intercept_dr_read, - nested_vmcb->control.intercept_dr_write, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept, - nested_vmcb->control.msrpm_base_pa, - nested_vmcb->control.iopm_base_pa, - nested_vmcb->control.tsc_offset }; + struct vmcb *vmcb = svm->vmcb; /* Give the current vmcb to the guest */ - memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); - nested_vmcb->save.cr0 = nested_save[0]; - if (!npt_enabled) - nested_vmcb->save.cr3 = nested_save[1]; - nested_vmcb->save.cr4 = nested_save[2]; - nested_vmcb->save.efer = nested_save[3]; - nested_vmcb->control.intercept_cr_read = nested_save[4]; - nested_vmcb->control.intercept_cr_write = nested_save[5]; - nested_vmcb->control.intercept_dr_read = nested_save[6]; - nested_vmcb->control.intercept_dr_write = nested_save[7]; - nested_vmcb->control.intercept_exceptions = nested_save[8]; - nested_vmcb->control.intercept = nested_save[9]; - nested_vmcb->control.msrpm_base_pa = nested_save[10]; - nested_vmcb->control.iopm_base_pa = nested_save[11]; - nested_vmcb->control.tsc_offset = nested_save[12]; + disable_gif(svm); + + nested_vmcb->save.es = vmcb->save.es; + nested_vmcb->save.cs = vmcb->save.cs; + nested_vmcb->save.ss = vmcb->save.ss; + nested_vmcb->save.ds = vmcb->save.ds; + nested_vmcb->save.gdtr = vmcb->save.gdtr; + nested_vmcb->save.idtr = vmcb->save.idtr; + if (npt_enabled) + nested_vmcb->save.cr3 = vmcb->save.cr3; + nested_vmcb->save.cr2 = vmcb->save.cr2; + nested_vmcb->save.rflags = vmcb->save.rflags; + nested_vmcb->save.rip = vmcb->save.rip; + nested_vmcb->save.rsp = vmcb->save.rsp; + nested_vmcb->save.rax = vmcb->save.rax; + nested_vmcb->save.dr7 = vmcb->save.dr7; + nested_vmcb->save.dr6 = vmcb->save.dr6; + nested_vmcb->save.cpl = vmcb->save.cpl; + + nested_vmcb->control.int_ctl = vmcb->control.int_ctl; + nested_vmcb->control.int_vector = vmcb->control.int_vector; + nested_vmcb->control.int_state = vmcb->control.int_state; + nested_vmcb->control.exit_code = vmcb->control.exit_code; + nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; + nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; + nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; + nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; + nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + nested_vmcb->control.tlb_ctl = 0; + nested_vmcb->control.event_inj = 0; + nested_vmcb->control.event_inj_err = 0; /* We always set V_INTR_MASKING and remember the old value in hflags */ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; - if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) && - (nested_vmcb->control.int_vector)) { - nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n", - nested_vmcb->control.int_vector); - } - /* Restore the original control entries */ svm->vmcb->control = hsave->control; /* Kill any pending exceptions */ if (svm->vcpu.arch.exception.pending == true) nsvm_printk("WARNING: Pending Exception\n"); + kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); @@ -1646,7 +1645,6 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - disable_gif(svm); /* Exit nested SVM mode */ svm->nested_vmcb = 0; -- cgit v1.2.3-70-g09d2 From defbba5660fb9fcad186bd799a635e52994a4d1a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:30 +0200 Subject: KVM: SVM: optimize nested vmrun Only copy the necessary parts of the vmcb save area on vmrun and save precious time. Signed-off-by: Joerg Roedel Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2f5f2236f2a2..f11f88005c29 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1681,6 +1681,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, { struct vmcb *nested_vmcb = (struct vmcb *)arg1; struct vmcb *hsave = svm->hsave; + struct vmcb *vmcb = svm->vmcb; /* nested_vmcb is our indicator if nested SVM is activated */ svm->nested_vmcb = svm->vmcb->save.rax; @@ -1691,12 +1692,25 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, /* Save the old vmcb, so we don't need to pick what we save, but can restore everything when a VMEXIT occurs */ - memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); - /* We need to remember the original CR3 in the SPT case */ - if (!npt_enabled) - hsave->save.cr3 = svm->vcpu.arch.cr3; - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rip = svm->next_rip; + hsave->save.es = vmcb->save.es; + hsave->save.cs = vmcb->save.cs; + hsave->save.ss = vmcb->save.ss; + hsave->save.ds = vmcb->save.ds; + hsave->save.gdtr = vmcb->save.gdtr; + hsave->save.idtr = vmcb->save.idtr; + hsave->save.efer = svm->vcpu.arch.shadow_efer; + hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rflags = vmcb->save.rflags; + hsave->save.rip = svm->next_rip; + hsave->save.rsp = vmcb->save.rsp; + hsave->save.rax = vmcb->save.rax; + if (npt_enabled) + hsave->save.cr3 = vmcb->save.cr3; + else + hsave->save.cr3 = svm->vcpu.arch.cr3; + + hsave->control = vmcb->control; if (svm->vmcb->save.rflags & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; @@ -1721,7 +1735,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); kvm_mmu_reset_context(&svm->vcpu); } - svm->vmcb->save.cr2 = nested_vmcb->save.cr2; + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); -- cgit v1.2.3-70-g09d2 From 0460a979b4b7a564e59eaa8efbba6f5ae38c5b78 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:31 +0200 Subject: KVM: SVM: copy only necessary parts of the control area on vmrun/vmexit The vmcb control area contains more then 800 bytes of reserved fields which are unnecessarily copied. Fix this by introducing a copy function which only copies the relevant part and saves time. Signed-off-by: Joerg Roedel Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f11f88005c29..df795bcebd22 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1567,6 +1567,38 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) nested_svm_exit_handled_real); } +static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) +{ + struct vmcb_control_area *dst = &dst_vmcb->control; + struct vmcb_control_area *from = &from_vmcb->control; + + dst->intercept_cr_read = from->intercept_cr_read; + dst->intercept_cr_write = from->intercept_cr_write; + dst->intercept_dr_read = from->intercept_dr_read; + dst->intercept_dr_write = from->intercept_dr_write; + dst->intercept_exceptions = from->intercept_exceptions; + dst->intercept = from->intercept; + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->nested_cr3 = from->nested_cr3; + dst->lbr_ctl = from->lbr_ctl; +} + static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, void *arg2, void *opaque) { @@ -1612,7 +1644,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; /* Restore the original control entries */ - svm->vmcb->control = hsave->control; + copy_vmcb_control_area(vmcb, hsave); /* Kill any pending exceptions */ if (svm->vcpu.arch.exception.pending == true) @@ -1710,7 +1742,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, else hsave->save.cr3 = svm->vcpu.arch.cr3; - hsave->control = vmcb->control; + copy_vmcb_control_area(hsave, vmcb); if (svm->vmcb->save.rflags & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; -- cgit v1.2.3-70-g09d2 From a5c3832dfe6324862b4fd1d90831266b15d4b58e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:32 +0200 Subject: KVM: SVM: complete interrupts after handling nested exits The interrupt completion code must run after nested exits are handled because not injected interrupts or exceptions may be handled by the l1 guest first. Signed-off-by: Joerg Roedel Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index df795bcebd22..825b82540f01 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -111,6 +111,7 @@ static int nested = 0; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); static int nested_svm_vmexit(struct vcpu_svm *svm); @@ -2324,6 +2325,8 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } } + svm_complete_interrupts(svm); + if (npt_enabled) { int mmu_reload = 0; if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { @@ -2690,8 +2693,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); } - - svm_complete_interrupts(svm); } #undef R -- cgit v1.2.3-70-g09d2 From e6aa9abd7381557c67be6a9e7240eb132ca00d66 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:33 +0200 Subject: KVM: SVM: move nested svm state into seperate struct This makes it more clear for which purpose these members in the vcpu_svm exist. Signed-off-by: Joerg Roedel Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 62 +++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 825b82540f01..fbadaa7cb27a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -70,6 +70,18 @@ static const u32 host_save_user_msrs[] = { struct kvm_vcpu; +struct nested_state { + struct vmcb *hsave; + u64 hsave_msr; + u64 vmcb; + + /* These are the merged vectors */ + u32 *msrpm; + + /* gpa pointers to the real vectors */ + u64 vmcb_msrpm; +}; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -85,16 +97,8 @@ struct vcpu_svm { u64 host_gs_base; u32 *msrpm; - struct vmcb *hsave; - u64 hsave_msr; - - u64 nested_vmcb; - /* These are the merged vectors */ - u32 *nested_msrpm; - - /* gpa pointers to the real vectors */ - u64 nested_vmcb_msrpm; + struct nested_state nested; }; /* enable NPT for AMD64 and X86 with PAE */ @@ -127,7 +131,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) static inline bool is_nested(struct vcpu_svm *svm) { - return svm->nested_vmcb; + return svm->nested.vmcb; } static inline void enable_gif(struct vcpu_svm *svm) @@ -636,7 +640,7 @@ static void init_vmcb(struct vcpu_svm *svm) } force_new_asid(&svm->vcpu); - svm->nested_vmcb = 0; + svm->nested.vmcb = 0; svm->vcpu.arch.hflags = 0; enable_gif(svm); @@ -699,9 +703,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) goto uninit; - svm->hsave = page_address(hsave_page); + svm->nested.hsave = page_address(hsave_page); - svm->nested_msrpm = page_address(nested_msrpm_pages); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); clear_page(svm->vmcb); @@ -731,8 +735,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); - __free_page(virt_to_page(svm->hsave)); - __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); + __free_page(virt_to_page(svm->nested.hsave)); + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -1558,13 +1562,13 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) switch (svm->vmcb->control.exit_code) { case SVM_EXIT_MSR: - return nested_svm_do(svm, svm->nested_vmcb, - svm->nested_vmcb_msrpm, NULL, + return nested_svm_do(svm, svm->nested.vmcb, + svm->nested.vmcb_msrpm, NULL, nested_svm_exit_handled_msr); default: break; } - return nested_svm_do(svm, svm->nested_vmcb, 0, &k, + return nested_svm_do(svm, svm->nested.vmcb, 0, &k, nested_svm_exit_handled_real); } @@ -1604,7 +1608,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, void *arg2, void *opaque) { struct vmcb *nested_vmcb = (struct vmcb *)arg1; - struct vmcb *hsave = svm->hsave; + struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; /* Give the current vmcb to the guest */ @@ -1679,7 +1683,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, svm->vmcb->control.exit_int_info = 0; /* Exit nested SVM mode */ - svm->nested_vmcb = 0; + svm->nested.vmcb = 0; return 0; } @@ -1687,7 +1691,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, static int nested_svm_vmexit(struct vcpu_svm *svm) { nsvm_printk("VMexit\n"); - if (nested_svm_do(svm, svm->nested_vmcb, 0, + if (nested_svm_do(svm, svm->nested.vmcb, 0, NULL, nested_svm_vmexit_real)) return 1; @@ -1703,8 +1707,8 @@ static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, int i; u32 *nested_msrpm = (u32*)arg1; for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) - svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm); + svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); return 0; } @@ -1713,11 +1717,11 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, void *arg2, void *opaque) { struct vmcb *nested_vmcb = (struct vmcb *)arg1; - struct vmcb *hsave = svm->hsave; + struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; /* nested_vmcb is our indicator if nested SVM is activated */ - svm->nested_vmcb = svm->vmcb->save.rax; + svm->nested.vmcb = svm->vmcb->save.rax; /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); @@ -1795,7 +1799,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->vmcb->control.intercept |= nested_vmcb->control.intercept; - svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; force_new_asid(&svm->vcpu); svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; @@ -1897,7 +1901,7 @@ static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) NULL, nested_svm_vmrun)) return 1; - if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, + if (nested_svm_do(svm, svm->nested.vmcb_msrpm, 0, NULL, nested_svm_vmrun_msrpm)) return 1; @@ -2107,7 +2111,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: - *data = svm->hsave_msr; + *data = svm->nested.hsave_msr; break; case MSR_VM_CR: *data = 0; @@ -2195,7 +2199,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm_disable_lbrv(svm); break; case MSR_VM_HSAVE_PA: - svm->hsave_msr = data; + svm->nested.hsave_msr = data; break; case MSR_VM_CR: case MSR_VM_IGNNE: -- cgit v1.2.3-70-g09d2 From aad42c641cfcda4f87abc4f6588329b9b3cc3364 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:34 +0200 Subject: KVM: SVM: cache nested intercepts When the nested intercepts are cached we don't need to call get_user_pages and/or map the nested vmcb on every nested #vmexit to check who will handle the intercept. Further this patch aligns the emulated svm behavior better to real hardware. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fbadaa7cb27a..4426c631057b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -80,6 +80,15 @@ struct nested_state { /* gpa pointers to the real vectors */ u64 vmcb_msrpm; + + /* cache for intercepts of the guest */ + u16 intercept_cr_read; + u16 intercept_cr_write; + u16 intercept_dr_read; + u16 intercept_dr_write; + u32 intercept_exceptions; + u64 intercept; + }; struct vcpu_svm { @@ -1452,7 +1461,6 @@ static int nested_svm_exit_handled_real(struct vcpu_svm *svm, void *arg2, void *opaque) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; bool kvm_overrides = *(bool *)opaque; u32 exit_code = svm->vmcb->control.exit_code; @@ -1479,38 +1487,38 @@ static int nested_svm_exit_handled_real(struct vcpu_svm *svm, switch (exit_code) { case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); - if (nested_vmcb->control.intercept_cr_read & cr_bits) + if (svm->nested.intercept_cr_read & cr_bits) return 1; break; } case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); - if (nested_vmcb->control.intercept_cr_write & cr_bits) + if (svm->nested.intercept_cr_write & cr_bits) return 1; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); - if (nested_vmcb->control.intercept_dr_read & dr_bits) + if (svm->nested.intercept_dr_read & dr_bits) return 1; break; } case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); - if (nested_vmcb->control.intercept_dr_write & dr_bits) + if (svm->nested.intercept_dr_write & dr_bits) return 1; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (nested_vmcb->control.intercept_exceptions & excp_bits) + if (svm->nested.intercept_exceptions & excp_bits) return 1; break; } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); nsvm_printk("exit code: 0x%x\n", exit_code); - if (nested_vmcb->control.intercept & exit_bits) + if (svm->nested.intercept & exit_bits) return 1; } } @@ -1801,6 +1809,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + /* cache intercepts */ + svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; + svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; + svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read; + svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write; + svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; + svm->nested.intercept = nested_vmcb->control.intercept; + force_new_asid(&svm->vcpu); svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err; -- cgit v1.2.3-70-g09d2 From 4c2161aed55c294c4c42622455f067a4b3077b85 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:35 +0200 Subject: KVM: SVM: consolidate nested_svm_exit_handled When caching guest intercepts there is no need anymore for the nested_svm_exit_handled_real function. So move its code into nested_svm_exit_handled. Signed-off-by: Joerg Roedel Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 109 ++++++++++++++++++++++++----------------------------- 1 file changed, 49 insertions(+), 60 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4426c631057b..bdd73fd9a75c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1456,15 +1456,58 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } -static int nested_svm_exit_handled_real(struct vcpu_svm *svm, - void *arg1, - void *arg2, - void *opaque) +static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, + void *arg1, void *arg2, + void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + u8 *msrpm = (u8 *)arg2; + u32 t0, t1; + u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u32 param = svm->vmcb->control.exit_info_1 & 1; + + if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return 0; + + switch (msr) { + case 0 ... 0x1fff: + t0 = (msr * 2) % 8; + t1 = msr / 8; + break; + case 0xc0000000 ... 0xc0001fff: + t0 = (8192 + msr - 0xc0000000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + case 0xc0010000 ... 0xc0011fff: + t0 = (16384 + msr - 0xc0010000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + default: + return 1; + break; + } + if (msrpm[t1] & ((1 << param) << t0)) + return 1; + + return 0; +} + +static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) { - bool kvm_overrides = *(bool *)opaque; u32 exit_code = svm->vmcb->control.exit_code; - if (kvm_overrides) { + switch (svm->vmcb->control.exit_code) { + case SVM_EXIT_MSR: + return nested_svm_do(svm, svm->nested.vmcb, + svm->nested.vmcb_msrpm, NULL, + nested_svm_exit_handled_msr); + default: + break; + } + + if (kvm_override) { switch (exit_code) { case SVM_EXIT_INTR: case SVM_EXIT_NMI: @@ -1526,60 +1569,6 @@ static int nested_svm_exit_handled_real(struct vcpu_svm *svm, return 0; } -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, - void *arg1, void *arg2, - void *opaque) -{ - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - u8 *msrpm = (u8 *)arg2; - u32 t0, t1; - u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u32 param = svm->vmcb->control.exit_info_1 & 1; - - if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return 0; - - switch(msr) { - case 0 ... 0x1fff: - t0 = (msr * 2) % 8; - t1 = msr / 8; - break; - case 0xc0000000 ... 0xc0001fff: - t0 = (8192 + msr - 0xc0000000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - case 0xc0010000 ... 0xc0011fff: - t0 = (16384 + msr - 0xc0010000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - default: - return 1; - break; - } - if (msrpm[t1] & ((1 << param) << t0)) - return 1; - - return 0; -} - -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) -{ - bool k = kvm_override; - - switch (svm->vmcb->control.exit_code) { - case SVM_EXIT_MSR: - return nested_svm_do(svm, svm->nested.vmcb, - svm->nested.vmcb_msrpm, NULL, - nested_svm_exit_handled_msr); - default: break; - } - - return nested_svm_do(svm, svm->nested.vmcb, 0, &k, - nested_svm_exit_handled_real); -} - static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) { struct vmcb_control_area *dst = &dst_vmcb->control; -- cgit v1.2.3-70-g09d2 From 9c4e40b9949868928bd7fec67a4b0902d90e57ed Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:36 +0200 Subject: KVM: SVM: do nested vmexit in nested_svm_exit_handled If this function returns true a nested vmexit is required. Move that vmexit into the nested_svm_exit_handled function. This also simplifies the handling of nested #pf intercepts in this function. Signed-off-by: Joerg Roedel Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bdd73fd9a75c..3bb6d4b92b34 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1366,8 +1366,6 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; if (nested_svm_exit_handled(svm, false)) { nsvm_printk("VMexit -> EXCP 0x%x\n", nr); - - nested_svm_vmexit(svm); return 1; } } @@ -1388,7 +1386,6 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) if (nested_svm_exit_handled(svm, false)) { nsvm_printk("VMexit -> INTR\n"); - nested_svm_vmexit(svm); return 1; } } @@ -1497,15 +1494,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) { u32 exit_code = svm->vmcb->control.exit_code; - - switch (svm->vmcb->control.exit_code) { - case SVM_EXIT_MSR: - return nested_svm_do(svm, svm->nested.vmcb, - svm->nested.vmcb_msrpm, NULL, - nested_svm_exit_handled_msr); - default: - break; - } + bool vmexit = false; if (kvm_override) { switch (exit_code) { @@ -1528,45 +1517,55 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) } switch (exit_code) { + case SVM_EXIT_MSR: + if (nested_svm_do(svm, svm->nested.vmcb, svm->nested.vmcb_msrpm, + NULL, nested_svm_exit_handled_msr)) + vmexit = true; + break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); if (svm->nested.intercept_cr_read & cr_bits) - return 1; + vmexit = true; break; } case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); if (svm->nested.intercept_cr_write & cr_bits) - return 1; + vmexit = true; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); if (svm->nested.intercept_dr_read & dr_bits) - return 1; + vmexit = true; break; } case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); if (svm->nested.intercept_dr_write & dr_bits) - return 1; + vmexit = true; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); if (svm->nested.intercept_exceptions & excp_bits) - return 1; + vmexit = true; break; } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); nsvm_printk("exit code: 0x%x\n", exit_code); if (svm->nested.intercept & exit_bits) - return 1; + vmexit = true; } } - return 0; + if (vmexit) { + nsvm_printk("#VMEXIT reason=%04x\n", exit_code); + nested_svm_vmexit(svm); + } + + return vmexit; } static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) @@ -2327,11 +2326,8 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", exit_code, svm->vmcb->control.exit_info_1, svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); - if (nested_svm_exit_handled(svm, true)) { - nested_svm_vmexit(svm); - nsvm_printk("-> #VMEXIT\n"); + if (nested_svm_exit_handled(svm, true)) return 1; - } } svm_complete_interrupts(svm); -- cgit v1.2.3-70-g09d2 From 0295ad7de86a6347316bc7414c1b9c15f56a1333 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:37 +0200 Subject: KVM: SVM: simplify nested_svm_check_exception Makes the code of this function more readable by removing on indentation level for the core logic. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3bb6d4b92b34..67fad6641d55 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1359,18 +1359,15 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code) { - if (is_nested(svm)) { - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = error_code; - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - if (nested_svm_exit_handled(svm, false)) { - nsvm_printk("VMexit -> EXCP 0x%x\n", nr); - return 1; - } - } + if (!is_nested(svm)) + return 0; - return 0; + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = error_code; + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + + return nested_svm_exit_handled(svm, false); } static inline int nested_svm_intr(struct vcpu_svm *svm) -- cgit v1.2.3-70-g09d2 From 34f80cfad59ee587e374cbaf5f2a31d9f5015404 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:38 +0200 Subject: KVM: SVM: get rid of nested_svm_vmexit_real This patch is the starting point of removing nested_svm_do from the nested svm code. The nested_svm_do function basically maps two guest physical pages to host virtual addresses and calls a passed function on it. This function pointer code flow is hard to read and not the best technical solution here. As a side effect this patch indroduces the nested_svm_[un]map helper functions. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 67fad6641d55..5e55a1bdd13d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1390,6 +1390,39 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) return 0; } +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) +{ + struct page *page; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + up_read(¤t->mm->mmap_sem); + + if (is_error_page(page)) + goto error; + + return kmap_atomic(page, idx); + +error: + kvm_release_page_clean(page); + kvm_inject_gp(&svm->vcpu, 0); + + return NULL; +} + +static void nested_svm_unmap(void *addr, enum km_type idx) +{ + struct page *page; + + if (!addr) + return; + + page = kmap_atomic_to_page(addr); + + kunmap_atomic(addr, idx); + kvm_release_page_dirty(page); +} + static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) { struct page *page; @@ -1597,13 +1630,16 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->lbr_ctl = from->lbr_ctl; } -static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static int nested_svm_vmexit(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); + if (!nested_vmcb) + return 1; + /* Give the current vmcb to the guest */ disable_gif(svm); @@ -1678,15 +1714,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, /* Exit nested SVM mode */ svm->nested.vmcb = 0; - return 0; -} - -static int nested_svm_vmexit(struct vcpu_svm *svm) -{ - nsvm_printk("VMexit\n"); - if (nested_svm_do(svm, svm->nested.vmcb, 0, - NULL, nested_svm_vmexit_real)) - return 1; + nested_svm_unmap(nested_vmcb, KM_USER0); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); -- cgit v1.2.3-70-g09d2 From 3d62d9aa9868865217ce3a1b70d6039a98b50820 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:39 +0200 Subject: KVM: SVM: clean up nested_svm_exit_handled_msr This patch changes nested svm to call nested_svm_exit_handled_msr directly and not through nested_svm_do. [alex: fix oops due to nested kmap_atomics] Signed-off-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5e55a1bdd13d..e85d79142ffc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1483,18 +1483,21 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, - void *arg1, void *arg2, - void *opaque) +static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - u8 *msrpm = (u8 *)arg2; - u32 t0, t1; - u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u32 param = svm->vmcb->control.exit_info_1 & 1; + u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + bool ret = false; + u32 t0, t1; + u8 *msrpm; - if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return 0; + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return false; + + msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); + + if (!msrpm) + goto out; switch (msr) { case 0 ... 0x1fff: @@ -1512,13 +1515,16 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, t0 %= 8; break; default: - return 1; - break; + ret = true; + goto out; } - if (msrpm[t1] & ((1 << param) << t0)) - return 1; - return 0; + ret = msrpm[t1] & ((1 << param) << t0); + +out: + nested_svm_unmap(msrpm, KM_USER0); + + return ret; } static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) @@ -1548,9 +1554,7 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) switch (exit_code) { case SVM_EXIT_MSR: - if (nested_svm_do(svm, svm->nested.vmcb, svm->nested.vmcb_msrpm, - NULL, nested_svm_exit_handled_msr)) - vmexit = true; + vmexit = nested_svm_exit_handled_msr(svm); break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); -- cgit v1.2.3-70-g09d2 From 9966bf6872598362b632b738213edfb5a961315d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:40 +0200 Subject: KVM: SVM: clean up nestec vmload/vmsave paths This patch removes the usage of nested_svm_do from the vmload and vmsave emulation code paths. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e85d79142ffc..78c0463ccdd4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -128,8 +128,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); static int nested_svm_vmexit(struct vcpu_svm *svm); -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); @@ -1865,7 +1863,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, return 0; } -static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; to_vmcb->save.gs = from_vmcb->save.gs; @@ -1879,44 +1877,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; - - return 1; -} - -static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque) -{ - return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb); -} - -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque) -{ - return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb); } static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + struct vmcb *nested_vmcb; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(nested_vmcb, svm->vmcb); + nested_svm_unmap(nested_vmcb, KM_USER0); return 1; } static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + struct vmcb *nested_vmcb; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(svm->vmcb, nested_vmcb); + nested_svm_unmap(nested_vmcb, KM_USER0); return 1; } -- cgit v1.2.3-70-g09d2 From 9738b2c97d19d87e5c204ae8c3f715a546bb6773 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:41 +0200 Subject: KVM: SVM: clean up nested vmrun path This patch removes the usage of nested_svm_do from the vmrun emulation path. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 78c0463ccdd4..68b6d4cdd153 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1724,25 +1724,35 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } -static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + u32 *nested_msrpm; int i; - u32 *nested_msrpm = (u32*)arg1; + + nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); + if (!nested_msrpm) + return false; + for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); - return 0; + nested_svm_unmap(nested_msrpm, KM_USER0); + + return true; } -static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + if (!nested_vmcb) + return false; + /* nested_vmcb is our indicator if nested SVM is activated */ svm->nested.vmcb = svm->vmcb->save.rax; @@ -1858,9 +1868,11 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + nested_svm_unmap(nested_vmcb, KM_USER0); + enable_gif(svm); - return 0; + return true; } static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) @@ -1928,12 +1940,10 @@ static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - if (nested_svm_do(svm, svm->vmcb->save.rax, 0, - NULL, nested_svm_vmrun)) + if (!nested_svm_vmrun(svm)) return 1; - if (nested_svm_do(svm, svm->nested.vmcb_msrpm, 0, - NULL, nested_svm_vmrun_msrpm)) + if (!nested_svm_vmrun_msrpm(svm)) return 1; return 1; -- cgit v1.2.3-70-g09d2 From ea8e064fe22a132da1473d82a57751208e6b8bfd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:42 +0200 Subject: KVM: SVM: remove nested_svm_do and helper functions This function is not longer required. So remove it. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 60 ------------------------------------------------------ 1 file changed, 60 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 68b6d4cdd153..d458297d54c8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1421,66 +1421,6 @@ static void nested_svm_unmap(void *addr, enum km_type idx) kvm_release_page_dirty(page); } -static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) -{ - struct page *page; - - down_read(¤t->mm->mmap_sem); - page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); - - if (is_error_page(page)) { - printk(KERN_INFO "%s: could not find page at 0x%llx\n", - __func__, gpa); - kvm_release_page_clean(page); - kvm_inject_gp(&svm->vcpu, 0); - return NULL; - } - return page; -} - -static int nested_svm_do(struct vcpu_svm *svm, - u64 arg1_gpa, u64 arg2_gpa, void *opaque, - int (*handler)(struct vcpu_svm *svm, - void *arg1, - void *arg2, - void *opaque)) -{ - struct page *arg1_page; - struct page *arg2_page = NULL; - void *arg1; - void *arg2 = NULL; - int retval; - - arg1_page = nested_svm_get_page(svm, arg1_gpa); - if(arg1_page == NULL) - return 1; - - if (arg2_gpa) { - arg2_page = nested_svm_get_page(svm, arg2_gpa); - if(arg2_page == NULL) { - kvm_release_page_clean(arg1_page); - return 1; - } - } - - arg1 = kmap_atomic(arg1_page, KM_USER0); - if (arg2_gpa) - arg2 = kmap_atomic(arg2_page, KM_USER1); - - retval = handler(svm, arg1, arg2, opaque); - - kunmap_atomic(arg1, KM_USER0); - if (arg2_gpa) - kunmap_atomic(arg2, KM_USER1); - - kvm_release_page_dirty(arg1_page); - if (arg2_gpa) - kvm_release_page_dirty(arg2_page); - - return retval; -} - static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) { u32 param = svm->vmcb->control.exit_info_1 & 1; -- cgit v1.2.3-70-g09d2 From 1f8da47805072e89454ccfdada553c2afc4dfb79 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:43 +0200 Subject: KVM: SVM: handle errors in vmrun emulation path appropriatly If nested svm fails to load the msrpm the vmrun succeeds with the old msrpm which is not correct. This patch changes the logic to roll back to host mode in case the msrpm cannot be loaded. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d458297d54c8..53376f144d70 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1874,6 +1874,7 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { nsvm_printk("VMrun\n"); + if (nested_svm_check_permissions(svm)) return 1; @@ -1884,7 +1885,18 @@ static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; if (!nested_svm_vmrun_msrpm(svm)) - return 1; + goto failed; + + return 1; + +failed: + + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); return 1; } -- cgit v1.2.3-70-g09d2 From 410e4d573d9b7fbd134f9a47815b6ad517492a08 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:44 +0200 Subject: KVM: SVM: move special nested exit handling to separate function This patch moves the handling for special nested vmexits like #pf to a separate function. This makes the kvm_override parameter obsolete and makes the code more readable. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 80 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 53376f144d70..be28a38db918 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -47,6 +47,10 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_FEATURE_SVML (1 << 2) +#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ +#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ +#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ + #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) /* Turn on to get debugging output*/ @@ -126,7 +130,7 @@ module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); +static int nested_svm_exit_handled(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); @@ -1365,7 +1369,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm->vmcb->control.exit_info_1 = error_code; svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - return nested_svm_exit_handled(svm, false); + return nested_svm_exit_handled(svm); } static inline int nested_svm_intr(struct vcpu_svm *svm) @@ -1379,7 +1383,7 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) svm->vmcb->control.exit_code = SVM_EXIT_INTR; - if (nested_svm_exit_handled(svm, false)) { + if (nested_svm_exit_handled(svm)) { nsvm_printk("VMexit -> INTR\n"); return 1; } @@ -1465,31 +1469,39 @@ out: return ret; } -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) +static int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; - bool vmexit = false; - if (kvm_override) { - switch (exit_code) { - case SVM_EXIT_INTR: - case SVM_EXIT_NMI: - return 0; + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + return NESTED_EXIT_HOST; /* For now we are always handling NPFs when using them */ - case SVM_EXIT_NPF: - if (npt_enabled) - return 0; - break; - /* When we're shadowing, trap PFs */ - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - if (!npt_enabled) - return 0; - break; - default: - break; - } + case SVM_EXIT_NPF: + if (npt_enabled) + return NESTED_EXIT_HOST; + break; + /* When we're shadowing, trap PFs */ + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + if (!npt_enabled) + return NESTED_EXIT_HOST; + break; + default: + break; } + return NESTED_EXIT_CONTINUE; +} + +/* + * If this function returns true, this #vmexit was already handled + */ +static int nested_svm_exit_handled(struct vcpu_svm *svm) +{ + u32 exit_code = svm->vmcb->control.exit_code; + int vmexit = NESTED_EXIT_HOST; + switch (exit_code) { case SVM_EXIT_MSR: vmexit = nested_svm_exit_handled_msr(svm); @@ -1497,42 +1509,42 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); if (svm->nested.intercept_cr_read & cr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); if (svm->nested.intercept_cr_write & cr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); if (svm->nested.intercept_dr_read & dr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); if (svm->nested.intercept_dr_write & dr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); if (svm->nested.intercept_exceptions & excp_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); nsvm_printk("exit code: 0x%x\n", exit_code); if (svm->nested.intercept & exit_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; } } - if (vmexit) { + if (vmexit == NESTED_EXIT_DONE) { nsvm_printk("#VMEXIT reason=%04x\n", exit_code); nested_svm_vmexit(svm); } @@ -2312,10 +2324,18 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, svm->vmcb->save.rip); if (is_nested(svm)) { + int vmexit; + nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", exit_code, svm->vmcb->control.exit_info_1, svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); - if (nested_svm_exit_handled(svm, true)) + + vmexit = nested_svm_exit_special(svm); + + if (vmexit == NESTED_EXIT_CONTINUE) + vmexit = nested_svm_exit_handled(svm); + + if (vmexit == NESTED_EXIT_DONE) return 1; } -- cgit v1.2.3-70-g09d2 From cda0ffdd862d36d0b054249ce920f00d1dbae037 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:45 +0200 Subject: KVM: SVM: remove unnecessary is_nested check from svm_cpu_run This check is not necessary. We have to sync the vcpu->arch.cr2 always back to the VMCB. This patch remove the is_nested check. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index be28a38db918..a1cfa7d57ab9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2602,8 +2602,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) fs_selector = kvm_read_fs(); gs_selector = kvm_read_gs(); ldt_selector = kvm_read_ldt(); - if (!is_nested(svm)) - svm->vmcb->save.cr2 = vcpu->arch.cr2; + svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm->vmcb->save.cr3 = vcpu->arch.cr3; -- cgit v1.2.3-70-g09d2 From 26666957a58fab7c8a4a31f1ab24d3bbf4c58c7a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:46 +0200 Subject: KVM: SVM: move nested_svm_intr main logic out of if-clause This patch removes one indentation level from nested_svm_intr and makes the logic more readable. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a1cfa7d57ab9..144f2025799f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1374,19 +1374,20 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, static inline int nested_svm_intr(struct vcpu_svm *svm) { - if (is_nested(svm)) { - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return 0; + if (!is_nested(svm)) + return 0; - if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return 0; + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return 0; - svm->vmcb->control.exit_code = SVM_EXIT_INTR; + if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) + return 0; - if (nested_svm_exit_handled(svm)) { - nsvm_printk("VMexit -> INTR\n"); - return 1; - } + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + + if (nested_svm_exit_handled(svm)) { + nsvm_printk("VMexit -> INTR\n"); + return 1; } return 0; -- cgit v1.2.3-70-g09d2 From 108768de55949c778bd95b36f3b17e652e59cd5b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:47 +0200 Subject: KVM: SVM: check for nested VINTR flag in svm_interrupt_allowed Not checking for this flag breaks any nested hypervisor that does not set VINTR. So fix it with this patch. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 144f2025799f..6610f60b4c96 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2463,7 +2463,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) return (vmcb->save.rflags & X86_EFLAGS_IF) && !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && gif_set(svm) && - !is_nested(svm); + !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); } static void enable_irq_window(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 4b6e4dca7011613c8508640c10a091b3ed2cd215 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 7 Aug 2009 11:49:48 +0200 Subject: KVM: SVM: enable nested svm by default Nested SVM is (in my experience) stable enough to be enabled by default. So omit the requirement to pass a module parameter. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6610f60b4c96..8f7751a66a2b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -124,7 +124,7 @@ static int npt = 1; module_param(npt, int, S_IRUGO); -static int nested = 0; +static int nested = 1; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); -- cgit v1.2.3-70-g09d2 From cb142eb743d02d48165c9d941b601d731cc4a003 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 9 Aug 2009 15:17:40 +0300 Subject: KVM: Update cr8 intercept when APIC TPR is changed by userspace Since on vcpu entry we do it only if apic is enabled we should do it when TPR is changed while apic is disabled. This happens when windows resets HW without setting TPR to zero. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 132c5100d4f4..31bf98427f4e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -77,6 +77,7 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +static void update_cr8_intercept(struct kvm_vcpu *vcpu); static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); @@ -1629,6 +1630,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); + update_cr8_intercept(vcpu); vcpu_put(vcpu); return 0; -- cgit v1.2.3-70-g09d2 From 52c7847d121da3651c08d9e9a99eb8a7cf2faa7a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 Aug 2009 15:42:41 +0300 Subject: KVM: SVM: Drop tlb flush workaround in npt It is no longer possible to reproduce the problem any more, so presumably it has been fixed. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8f7751a66a2b..944cc9c04b3c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1187,17 +1187,8 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) error_code = svm->vmcb->control.exit_info_1; trace_kvm_page_fault(fault_address, error_code); - /* - * FIXME: Tis shouldn't be necessary here, but there is a flush - * missing in the MMU code. Until we find this bug, flush the - * complete TLB here on an NPF - */ - if (npt_enabled) - svm_flush_tlb(&svm->vcpu); - else { - if (kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - } + if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } -- cgit v1.2.3-70-g09d2 From 6621fbc2c6add9c9abfc87d7bc9b248ffd307ae3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 Aug 2009 17:00:52 +0300 Subject: KVM: Move #endif KVM_CAP_IRQ_ROUTING to correct place The symbol only controls irq routing, not MSI-X. Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4470251ddc53..1df4c0422f68 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2236,6 +2236,7 @@ static long kvm_vm_ioctl(struct file *filp, vfree(entries); break; } +#endif /* KVM_CAP_IRQ_ROUTING */ #ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; @@ -2258,7 +2259,6 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif -#endif /* KVM_CAP_IRQ_ROUTING */ case KVM_IRQFD: { struct kvm_irqfd data; -- cgit v1.2.3-70-g09d2 From 345dcaa8fde7fa70252d58c862bf41fd2149ca2c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Aug 2009 15:29:37 +0300 Subject: KVM: VMX: Adjust rflags if in real mode emulation We set rflags.vm86 when virtualizing real mode to do through vm8086 mode; so we need to take it out again when reading rflags. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 31c3a8740c42..2b7e7bd190fd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -781,7 +781,12 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - return vmcs_readl(GUEST_RFLAGS); + unsigned long rflags; + + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) + rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -- cgit v1.2.3-70-g09d2 From c0c7c04b874bf98a10d8e0c8322a5d3bc93536bc Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Tue, 11 Aug 2009 15:57:59 -0500 Subject: KVM: When switching to a vm8086 task, load segments as 16-bit According to 16.2.5 in the SDM, eflags.vm in the tss is consulted before loading and new segments. If eflags.vm == 1, then the segments are treated as 16-bit segments. The LDTR and TR are not normally available in vm86 mode so if they happen to somehow get loaded, they need to be treated as 32-bit segments. This fixes an invalid vmentry failure in a custom OS that was happening after a task switch into vm8086 mode. Since the segments were being mistakenly treated as 32-bit, we loaded garbage state. Signed-off-by: Anthony Liguori Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 31bf98427f4e..1aa7e6d91d4f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4101,12 +4101,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se return 0; } +static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && + (seg != VCPU_SREG_TR) && + (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); +} + int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { struct kvm_segment kvm_seg; - if (!(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; -- cgit v1.2.3-70-g09d2 From 56e8231841301ad38e347e33fd4319c89f697045 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Aug 2009 15:04:37 +0300 Subject: KVM: Rename x86_emulate.c to emulate.c We're in arch/x86, what could we possibly be emulating? Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 187 +++ arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_x86_emulate.h | 187 --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/emulate.c | 2392 ++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86_emulate.c | 2392 -------------------------------- 7 files changed, 2582 insertions(+), 2582 deletions(-) create mode 100644 arch/x86/include/asm/kvm_emulate.h delete mode 100644 arch/x86/include/asm/kvm_x86_emulate.h create mode 100644 arch/x86/kvm/emulate.c delete mode 100644 arch/x86/kvm/x86_emulate.c diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h new file mode 100644 index 000000000000..b7ed2c423116 --- /dev/null +++ b/arch/x86/include/asm/kvm_emulate.h @@ -0,0 +1,187 @@ +/****************************************************************************** + * x86_emulate.h + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef _ASM_X86_KVM_X86_EMULATE_H +#define _ASM_X86_KVM_X86_EMULATE_H + +struct x86_emulate_ctxt; + +/* + * x86_emulate_ops: + * + * These operations represent the instruction emulator's interface to memory. + * There are two categories of operation: those that act on ordinary memory + * regions (*_std), and those that act on memory regions known to require + * special treatment or emulation (*_emulated). + * + * The emulator assumes that an instruction accesses only one 'emulated memory' + * location, that this location is the given linear faulting address (cr2), and + * that this is one of the instruction's data operands. Instruction fetches and + * stack operations are assumed never to access emulated memory. The emulator + * automatically deduces which operand of a string-move operation is accessing + * emulated memory, and assumes that the other operand accesses normal memory. + * + * NOTES: + * 1. The emulator isn't very smart about emulated vs. standard memory. + * 'Emulated memory' access addresses should be checked for sanity. + * 'Normal memory' accesses may fault, and the caller must arrange to + * detect and handle reentrancy into the emulator via recursive faults. + * Accesses may be unaligned and may cross page boundaries. + * 2. If the access fails (cannot emulate, or a standard access faults) then + * it is up to the memop to propagate the fault to the guest VM via + * some out-of-band mechanism, unknown to the emulator. The memop signals + * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will + * then immediately bail. + * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only + * cmpxchg8b_emulated need support 8-byte accesses. + * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. + */ +/* Access completed successfully: continue emulation as normal. */ +#define X86EMUL_CONTINUE 0 +/* Access is unhandleable: bail from emulation and return error to caller. */ +#define X86EMUL_UNHANDLEABLE 1 +/* Terminate emulation but return success to the caller. */ +#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ +#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ +#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ +struct x86_emulate_ops { + /* + * read_std: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch, stack operations, and others. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_std)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu); + + /* + * read_emulated: Read bytes from emulated/special memory area. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_emulated)(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); + + /* + * write_emulated: Read bytes from emulated/special memory area. + * @addr: [IN ] Linear address to which to write. + * @val: [IN ] Value to write to memory (low-order bytes used as + * required). + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_emulated)(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); + + /* + * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an + * emulated/special memory area. + * @addr: [IN ] Linear address to access. + * @old: [IN ] Value expected to be current at @addr. + * @new: [IN ] Value to write to @addr. + * @bytes: [IN ] Number of bytes to access using CMPXCHG. + */ + int (*cmpxchg_emulated)(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct kvm_vcpu *vcpu); + +}; + +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + unsigned int bytes; + unsigned long val, orig_val, *ptr; +}; + +struct fetch_cache { + u8 data[15]; + unsigned long start; + unsigned long end; +}; + +struct decode_cache { + u8 twobyte; + u8 b; + u8 lock_prefix; + u8 rep_prefix; + u8 op_bytes; + u8 ad_bytes; + u8 rex_prefix; + struct operand src; + struct operand src2; + struct operand dst; + bool has_seg_override; + u8 seg_override; + unsigned int d; + unsigned long regs[NR_VCPU_REGS]; + unsigned long eip; + /* modrm */ + u8 modrm; + u8 modrm_mod; + u8 modrm_reg; + u8 modrm_rm; + u8 use_modrm_ea; + bool rip_relative; + unsigned long modrm_ea; + void *modrm_ptr; + unsigned long modrm_val; + struct fetch_cache fetch; +}; + +#define X86_SHADOW_INT_MOV_SS 1 +#define X86_SHADOW_INT_STI 2 + +struct x86_emulate_ctxt { + /* Register state before/after emulation. */ + struct kvm_vcpu *vcpu; + + unsigned long eflags; + /* Emulated execution mode, represented by an X86EMUL_MODE value. */ + int mode; + u32 cs_base; + + /* interruptibility state, as a result of execution of STI or MOV SS */ + int interruptibility; + + /* decode cache */ + struct decode_cache decode; +}; + +/* Repeat String Operation Prefix */ +#define REPE_PREFIX 1 +#define REPNE_PREFIX 2 + +/* Execution mode, passed to the emulator. */ +#define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ +#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ +#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ + +/* Host execution mode. */ +#if defined(CONFIG_X86_32) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 +#elif defined(CONFIG_X86_64) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 +#endif + +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); + +#endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b17d845897b7..33901be75a36 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -138,7 +138,7 @@ enum { VCPU_SREG_LDTR, }; -#include +#include #define KVM_NR_MEM_OBJS 40 diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h deleted file mode 100644 index b7ed2c423116..000000000000 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ /dev/null @@ -1,187 +0,0 @@ -/****************************************************************************** - * x86_emulate.h - * - * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. - * - * Copyright (c) 2005 Keir Fraser - * - * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 - */ - -#ifndef _ASM_X86_KVM_X86_EMULATE_H -#define _ASM_X86_KVM_X86_EMULATE_H - -struct x86_emulate_ctxt; - -/* - * x86_emulate_ops: - * - * These operations represent the instruction emulator's interface to memory. - * There are two categories of operation: those that act on ordinary memory - * regions (*_std), and those that act on memory regions known to require - * special treatment or emulation (*_emulated). - * - * The emulator assumes that an instruction accesses only one 'emulated memory' - * location, that this location is the given linear faulting address (cr2), and - * that this is one of the instruction's data operands. Instruction fetches and - * stack operations are assumed never to access emulated memory. The emulator - * automatically deduces which operand of a string-move operation is accessing - * emulated memory, and assumes that the other operand accesses normal memory. - * - * NOTES: - * 1. The emulator isn't very smart about emulated vs. standard memory. - * 'Emulated memory' access addresses should be checked for sanity. - * 'Normal memory' accesses may fault, and the caller must arrange to - * detect and handle reentrancy into the emulator via recursive faults. - * Accesses may be unaligned and may cross page boundaries. - * 2. If the access fails (cannot emulate, or a standard access faults) then - * it is up to the memop to propagate the fault to the guest VM via - * some out-of-band mechanism, unknown to the emulator. The memop signals - * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will - * then immediately bail. - * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only - * cmpxchg8b_emulated need support 8-byte accesses. - * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. - */ -/* Access completed successfully: continue emulation as normal. */ -#define X86EMUL_CONTINUE 0 -/* Access is unhandleable: bail from emulation and return error to caller. */ -#define X86EMUL_UNHANDLEABLE 1 -/* Terminate emulation but return success to the caller. */ -#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ -#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ -#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ -struct x86_emulate_ops { - /* - * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. - * @addr: [IN ] Linear address from which to read. - * @val: [OUT] Value read from memory, zero-extended to 'u_long'. - * @bytes: [IN ] Number of bytes to read from memory. - */ - int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); - - /* - * read_emulated: Read bytes from emulated/special memory area. - * @addr: [IN ] Linear address from which to read. - * @val: [OUT] Value read from memory, zero-extended to 'u_long'. - * @bytes: [IN ] Number of bytes to read from memory. - */ - int (*read_emulated)(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - - /* - * write_emulated: Read bytes from emulated/special memory area. - * @addr: [IN ] Linear address to which to write. - * @val: [IN ] Value to write to memory (low-order bytes used as - * required). - * @bytes: [IN ] Number of bytes to write to memory. - */ - int (*write_emulated)(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - - /* - * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an - * emulated/special memory area. - * @addr: [IN ] Linear address to access. - * @old: [IN ] Value expected to be current at @addr. - * @new: [IN ] Value to write to @addr. - * @bytes: [IN ] Number of bytes to access using CMPXCHG. - */ - int (*cmpxchg_emulated)(unsigned long addr, - const void *old, - const void *new, - unsigned int bytes, - struct kvm_vcpu *vcpu); - -}; - -/* Type, address-of, and value of an instruction's operand. */ -struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; - unsigned int bytes; - unsigned long val, orig_val, *ptr; -}; - -struct fetch_cache { - u8 data[15]; - unsigned long start; - unsigned long end; -}; - -struct decode_cache { - u8 twobyte; - u8 b; - u8 lock_prefix; - u8 rep_prefix; - u8 op_bytes; - u8 ad_bytes; - u8 rex_prefix; - struct operand src; - struct operand src2; - struct operand dst; - bool has_seg_override; - u8 seg_override; - unsigned int d; - unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; - /* modrm */ - u8 modrm; - u8 modrm_mod; - u8 modrm_reg; - u8 modrm_rm; - u8 use_modrm_ea; - bool rip_relative; - unsigned long modrm_ea; - void *modrm_ptr; - unsigned long modrm_val; - struct fetch_cache fetch; -}; - -#define X86_SHADOW_INT_MOV_SS 1 -#define X86_SHADOW_INT_STI 2 - -struct x86_emulate_ctxt { - /* Register state before/after emulation. */ - struct kvm_vcpu *vcpu; - - unsigned long eflags; - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ - int mode; - u32 cs_base; - - /* interruptibility state, as a result of execution of STI or MOV SS */ - int interruptibility; - - /* decode cache */ - struct decode_cache decode; -}; - -/* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 - -/* Execution mode, passed to the emulator. */ -#define X86EMUL_MODE_REAL 0 /* Real mode. */ -#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ -#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ -#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ - -/* Host execution mode. */ -#if defined(CONFIG_X86_32) -#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 -#elif defined(CONFIG_X86_64) -#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 -#endif - -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); - -#endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index afaaa7627d95..0e7fe78d0f74 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -9,7 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) -kvm-y += x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ +kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o timer.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c new file mode 100644 index 000000000000..2eb807a7b620 --- /dev/null +++ b/arch/x86/kvm/emulate.c @@ -0,0 +1,2392 @@ +/****************************************************************************** + * emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * Linux coding style, mod r/m decoder, segment base fixes, real-mode + * privileged instructions: + * + * Copyright (C) 2006 Qumranet + * + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __KERNEL__ +#include +#include +#include +#define DPRINTF(_f, _a ...) printf(_f , ## _a) +#else +#include +#include "kvm_cache_regs.h" +#define DPRINTF(x...) do {} while (0) +#endif +#include +#include + +#include "mmu.h" /* for is_long_mode() */ + +/* + * Opcode effective-address decode tables. + * Note that we only emulate instructions that have at least one memory + * operand (excluding implicit stack references). We assume that stack + * references and instruction fetches will never occur in special memory + * areas that require emulation. So, for example, 'mov ,' need + * not be handled. + */ + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstMask (7<<1) +/* Source operand type. */ +#define SrcNone (0<<4) /* No source operand. */ +#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<4) /* Register operand. */ +#define SrcMem (2<<4) /* Memory operand. */ +#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ +#define SrcImm (5<<4) /* Immediate operand. */ +#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcOne (7<<4) /* Implied '1' */ +#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ +#define SrcImmU (9<<4) /* Immediate operand, unsigned */ +#define SrcMask (0xf<<4) +/* Generic ModRM decode. */ +#define ModRM (1<<8) +/* Destination is only written; never read. */ +#define Mov (1<<9) +#define BitOp (1<<10) +#define MemAbs (1<<11) /* Memory operand is absolute displacement */ +#define String (1<<12) /* String instruction (rep capable) */ +#define Stack (1<<13) /* Stack instruction (push/pop) */ +#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ +#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define GroupMask 0xff /* Group number stored in bits 0:7 */ +/* Source 2 operand type */ +#define Src2None (0<<29) +#define Src2CL (1<<29) +#define Src2ImmByte (2<<29) +#define Src2One (3<<29) +#define Src2Imm16 (4<<29) +#define Src2Mask (7<<29) + +enum { + Group1_80, Group1_81, Group1_82, Group1_83, + Group1A, Group3_Byte, Group3, Group4, Group5, Group7, +}; + +static u32 opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, + /* 0x08 - 0x0F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x10 - 0x17 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x18 - 0x1F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x20 - 0x27 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + /* 0x28 - 0x2F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x38 - 0x3F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + 0, 0, + /* 0x40 - 0x47 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x48 - 0x4F */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x50 - 0x57 */ + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + /* 0x58 - 0x5F */ + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + /* 0x60 - 0x67 */ + 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + 0, 0, 0, 0, + /* 0x68 - 0x6F */ + SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + /* 0x70 - 0x77 */ + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + /* 0x78 - 0x7F */ + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + /* 0x80 - 0x87 */ + Group | Group1_80, Group | Group1_81, + Group | Group1_82, Group | Group1_83, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + /* 0x88 - 0x8F */ + ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, + ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, + DstReg | SrcMem | ModRM | Mov, Group | Group1A, + /* 0x90 - 0x97 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x98 - 0x9F */ + 0, 0, SrcImm | Src2Imm16, 0, + ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + /* 0xA0 - 0xA7 */ + ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, + ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xA8 - 0xAF */ + 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xB0 - 0xB7 */ + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + /* 0xB8 - 0xBF */ + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + /* 0xC0 - 0xC7 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + 0, ImplicitOps | Stack, 0, 0, + ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, ImplicitOps | Stack, + ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, + /* 0xD0 - 0xD7 */ + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + 0, 0, 0, 0, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xE7 */ + 0, 0, 0, 0, + ByteOp | SrcImmUByte, SrcImmUByte, + ByteOp | SrcImmUByte, SrcImmUByte, + /* 0xE8 - 0xEF */ + SrcImm | Stack, SrcImm | ImplicitOps, + SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + /* 0xF8 - 0xFF */ + ImplicitOps, 0, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, +}; + +static u32 twobyte_table[256] = { + /* 0x00 - 0x0F */ + 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + /* 0x20 - 0x2F */ + ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x3F */ + ImplicitOps, 0, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x48 - 0x4F */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x8F */ + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, 0, 0, + /* 0xA8 - 0xAF */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, + ModRM, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, + DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xB8 - 0xBF */ + 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xC0 - 0xCF */ + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static u32 group_table[] = { + [Group1_80*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_81*8] = + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + [Group1_82*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_83*8] = + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + [Group1A*8] = + DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, + [Group3_Byte*8] = + ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group3*8] = + DstMem | SrcImm | ModRM, 0, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group4*8] = + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, 0, 0, + [Group5*8] = + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, + [Group7*8] = + 0, 0, ModRM | SrcMem, ModRM | SrcMem, + SrcNone | ModRM | DstMem | Mov, 0, + SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, +}; + +static u32 group2_table[] = { + [Group7*8] = + SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | DstMem | Mov, 0, + SrcMem16 | ModRM | Mov, 0, +}; + +/* EFLAGS bit definitions. */ +#define EFLG_VM (1<<17) +#define EFLG_RF (1<<16) +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_IF (1<<9) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. + */ + +#if defined(CONFIG_X86_64) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. */ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ + /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ + "movl %"_sav",%"_LO32 _tmp"; " \ + "push %"_tmp"; " \ + "push %"_tmp"; " \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pushf; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ + "pop %"_tmp"; " \ + "orl %"_LO32 _tmp",("_STK"); " \ + "popf; " \ + "pop %"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ + /* _sav |= EFLAGS & _msk; */ \ + "pushf; " \ + "pop %"_tmp"; " \ + "andl %"_msk",%"_LO32 _tmp"; " \ + "orl %"_LO32 _tmp",%"_sav"; " + +#ifdef CONFIG_X86_64 +#define ON64(x) x +#else +#define ON64(x) +#endif + +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op _suffix " %"_x"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _y ((_src).val), "i" (EFLAGS_MASK)); \ + } while (0) + + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ + break; \ + case 4: \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ + break; \ + case 8: \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ + break; \ + } \ + } while (0) + +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + switch ((_dst).bytes) { \ + case 1: \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ + break; \ + default: \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + _wx, _wy, _lx, _ly, _qx, _qy); \ + break; \ + } \ + } while (0) + +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") + +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") + +/* Source operand is word, long or quad sized. */ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has three operands and one operand is stored in ECX register */ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ + } while (0) + +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ + } while (0) + +#define __emulate_1op(_op, _dst, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op _suffix " %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "+m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + } while (0) + +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ + case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ + case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ + } \ + } while (0) + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ + if (rc != 0) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +static inline unsigned long ad_mask(struct decode_cache *c) +{ + return (1UL << (c->ad_bytes << 3)) - 1; +} + +/* Access/update address held in a register, based on addressing mode. */ +static inline unsigned long +address_mask(struct decode_cache *c, unsigned long reg) +{ + if (c->ad_bytes == sizeof(unsigned long)) + return reg; + else + return reg & ad_mask(c); +} + +static inline unsigned long +register_address(struct decode_cache *c, unsigned long base, unsigned long reg) +{ + return base + address_mask(c, reg); +} + +static inline void +register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) +{ + if (c->ad_bytes == sizeof(unsigned long)) + *reg += inc; + else + *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); +} + +static inline void jmp_rel(struct decode_cache *c, int rel) +{ + register_address_increment(c, &c->eip, rel); +} + +static void set_seg_override(struct decode_cache *c, int seg) +{ + c->has_seg_override = true; + c->seg_override = seg; +} + +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +{ + if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) + return 0; + + return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); +} + +static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, + struct decode_cache *c) +{ + if (!c->has_seg_override) + return 0; + + return seg_base(ctxt, c->seg_override); +} + +static unsigned long es_base(struct x86_emulate_ctxt *ctxt) +{ + return seg_base(ctxt, VCPU_SREG_ES); +} + +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) +{ + return seg_base(ctxt, VCPU_SREG_SS); +} + +static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long linear, u8 *dest) +{ + struct fetch_cache *fc = &ctxt->decode.fetch; + int rc; + int size; + + if (linear < fc->start || linear >= fc->end) { + size = min(15UL, PAGE_SIZE - offset_in_page(linear)); + rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + if (rc) + return rc; + fc->start = linear; + fc->end = linear + size; + } + *dest = fc->data[linear - fc->start]; + return 0; +} + +static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long eip, void *dest, unsigned size) +{ + int rc = 0; + + eip += ctxt->cs_base; + while (size--) { + rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); + if (rc) + return rc; + } + return 0; +} + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. + */ +static void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) +{ + void *p; + + p = ®s[modrm_reg]; + if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) + p = (unsigned char *)®s[modrm_reg & 3] + 1; + return p; +} + +static int read_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *ptr, + u16 *size, unsigned long *address, int op_bytes) +{ + int rc; + + if (op_bytes == 2) + op_bytes = 3; + *address = 0; + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, + ctxt->vcpu); + if (rc) + return rc; + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, + ctxt->vcpu); + return rc; +} + +static int test_cc(unsigned int condition, unsigned int flags) +{ + int rc = 0; + + switch ((condition & 15) >> 1) { + case 0: /* o */ + rc |= (flags & EFLG_OF); + break; + case 1: /* b/c/nae */ + rc |= (flags & EFLG_CF); + break; + case 2: /* z/e */ + rc |= (flags & EFLG_ZF); + break; + case 3: /* be/na */ + rc |= (flags & (EFLG_CF|EFLG_ZF)); + break; + case 4: /* s */ + rc |= (flags & EFLG_SF); + break; + case 5: /* p/pe */ + rc |= (flags & EFLG_PF); + break; + case 7: /* le/ng */ + rc |= (flags & EFLG_ZF); + /* fall through */ + case 6: /* l/nge */ + rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); + break; + } + + /* Odd condition identifiers (lsb == 1) have inverted sense. */ + return (!!rc ^ (condition & 1)); +} + +static void decode_register_operand(struct operand *op, + struct decode_cache *c, + int inhibit_bytereg) +{ + unsigned reg = c->modrm_reg; + int highbyte_regs = c->rex_prefix == 0; + + if (!(c->d & ModRM)) + reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + op->type = OP_REG; + if ((c->d & ByteOp) && !inhibit_bytereg) { + op->ptr = decode_register(reg, c->regs, highbyte_regs); + op->val = *(u8 *)op->ptr; + op->bytes = 1; + } else { + op->ptr = decode_register(reg, c->regs, 0); + op->bytes = c->op_bytes; + switch (op->bytes) { + case 2: + op->val = *(u16 *)op->ptr; + break; + case 4: + op->val = *(u32 *)op->ptr; + break; + case 8: + op->val = *(u64 *) op->ptr; + break; + } + } + op->orig_val = op->val; +} + +static int decode_modrm(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + u8 sib; + int index_reg = 0, base_reg = 0, scale; + int rc = 0; + + if (c->rex_prefix) { + c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ + index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ + c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ + } + + c->modrm = insn_fetch(u8, 1, c->eip); + c->modrm_mod |= (c->modrm & 0xc0) >> 6; + c->modrm_reg |= (c->modrm & 0x38) >> 3; + c->modrm_rm |= (c->modrm & 0x07); + c->modrm_ea = 0; + c->use_modrm_ea = 1; + + if (c->modrm_mod == 3) { + c->modrm_ptr = decode_register(c->modrm_rm, + c->regs, c->d & ByteOp); + c->modrm_val = *(unsigned long *)c->modrm_ptr; + return rc; + } + + if (c->ad_bytes == 2) { + unsigned bx = c->regs[VCPU_REGS_RBX]; + unsigned bp = c->regs[VCPU_REGS_RBP]; + unsigned si = c->regs[VCPU_REGS_RSI]; + unsigned di = c->regs[VCPU_REGS_RDI]; + + /* 16-bit ModR/M decode. */ + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 6) + c->modrm_ea += insn_fetch(u16, 2, c->eip); + break; + case 1: + c->modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + c->modrm_ea += insn_fetch(u16, 2, c->eip); + break; + } + switch (c->modrm_rm) { + case 0: + c->modrm_ea += bx + si; + break; + case 1: + c->modrm_ea += bx + di; + break; + case 2: + c->modrm_ea += bp + si; + break; + case 3: + c->modrm_ea += bp + di; + break; + case 4: + c->modrm_ea += si; + break; + case 5: + c->modrm_ea += di; + break; + case 6: + if (c->modrm_mod != 0) + c->modrm_ea += bp; + break; + case 7: + c->modrm_ea += bx; + break; + } + if (c->modrm_rm == 2 || c->modrm_rm == 3 || + (c->modrm_rm == 6 && c->modrm_mod != 0)) + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_SS); + c->modrm_ea = (u16)c->modrm_ea; + } else { + /* 32/64-bit ModR/M decode. */ + if ((c->modrm_rm & 7) == 4) { + sib = insn_fetch(u8, 1, c->eip); + index_reg |= (sib >> 3) & 7; + base_reg |= sib & 7; + scale = sib >> 6; + + if ((base_reg & 7) == 5 && c->modrm_mod == 0) + c->modrm_ea += insn_fetch(s32, 4, c->eip); + else + c->modrm_ea += c->regs[base_reg]; + if (index_reg != 4) + c->modrm_ea += c->regs[index_reg] << scale; + } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { + if (ctxt->mode == X86EMUL_MODE_PROT64) + c->rip_relative = 1; + } else + c->modrm_ea += c->regs[c->modrm_rm]; + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 5) + c->modrm_ea += insn_fetch(s32, 4, c->eip); + break; + case 1: + c->modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + c->modrm_ea += insn_fetch(s32, 4, c->eip); + break; + } + } +done: + return rc; +} + +static int decode_abs(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + switch (c->ad_bytes) { + case 2: + c->modrm_ea = insn_fetch(u16, 2, c->eip); + break; + case 4: + c->modrm_ea = insn_fetch(u32, 4, c->eip); + break; + case 8: + c->modrm_ea = insn_fetch(u64, 8, c->eip); + break; + } +done: + return rc; +} + +int +x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes, group; + + /* Shadow copy of register state. Committed on successful emulation. */ + + memset(c, 0, sizeof(struct decode_cache)); + c->eip = kvm_rip_read(ctxt->vcpu); + ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; + } + + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; + + /* Legacy prefixes. */ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x26: /* ES override */ + case 0x2e: /* CS override */ + case 0x36: /* SS override */ + case 0x3e: /* DS override */ + set_seg_override(c, (c->b >> 3) & 3); + break; + case 0x64: /* FS override */ + case 0x65: /* GS override */ + set_seg_override(c, c->b & 7); + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + + c->rex_prefix = 0; + } + +done_prefixes: + + /* REX prefix. */ + if (c->rex_prefix) + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ + + /* Opcode byte(s). */ + c->d = opcode_table[c->b]; + if (c->d == 0) { + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + c->d = twobyte_table[c->b]; + } + } + + if (c->d & Group) { + group = c->d & GroupMask; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + group = (group << 3) + ((c->modrm >> 3) & 7); + if ((c->d & GroupDual) && (c->modrm >> 6) == 3) + c->d = group2_table[group]; + else + c->d = group_table[group]; + } + + /* Unrecognised? */ + if (c->d == 0) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; + } + + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; + + /* ModRM and SIB bytes. */ + if (c->d & ModRM) + rc = decode_modrm(ctxt, ops); + else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops); + if (rc) + goto done; + + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); + + if (!(!c->twobyte && c->b == 0x8d)) + c->modrm_ea += seg_override_base(ctxt, c); + + if (c->ad_bytes != 8) + c->modrm_ea = (u32)c->modrm_ea; + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + c->src.bytes = 2; + goto srcmem_common; + case SrcMem32: + c->src.bytes = 4; + goto srcmem_common; + case SrcMem: + c->src.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) + break; + srcmem_common: + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->src.type = OP_REG; + c->src.val = c->modrm_val; + c->src.ptr = c->modrm_ptr; + break; + } + c->src.type = OP_MEM; + break; + case SrcImm: + case SrcImmU: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->src.bytes == 8) + c->src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (c->src.bytes) { + case 1: + c->src.val = insn_fetch(s8, 1, c->eip); + break; + case 2: + c->src.val = insn_fetch(s16, 2, c->eip); + break; + case 4: + c->src.val = insn_fetch(s32, 4, c->eip); + break; + } + if ((c->d & SrcMask) == SrcImmU) { + switch (c->src.bytes) { + case 1: + c->src.val &= 0xff; + break; + case 2: + c->src.val &= 0xffff; + break; + case 4: + c->src.val &= 0xffffffff; + break; + } + } + break; + case SrcImmByte: + case SrcImmUByte: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = 1; + if ((c->d & SrcMask) == SrcImmByte) + c->src.val = insn_fetch(s8, 1, c->eip); + else + c->src.val = insn_fetch(u8, 1, c->eip); + break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + } + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 1; + c->src2.val = insn_fetch(u8, 1, c->eip); + break; + case Src2Imm16: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 2; + c->src2.val = insn_fetch(u16, 2, c->eip); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + return 0; + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + break; + case DstMem: + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.type = OP_REG; + c->dst.val = c->dst.orig_val = c->modrm_val; + c->dst.ptr = c->modrm_ptr; + break; + } + c->dst.type = OP_MEM; + break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->op_bytes) { + case 1: + c->dst.val = *(u8 *)c->dst.ptr; + break; + case 2: + c->dst.val = *(u16 *)c->dst.ptr; + break; + case 4: + c->dst.val = *(u32 *)c->dst.ptr; + break; + } + c->dst.orig_val = c->dst.val; + break; + } + + if (c->rip_relative) + c->modrm_ea += c->eip; + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + +static inline void emulate_push(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_MEM; + c->dst.bytes = c->op_bytes; + c->dst.val = c->src.val; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); + c->dst.ptr = (void *) register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]); +} + +static int emulate_pop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + rc = ops->read_emulated(register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]), + dest, len, ctxt->vcpu); + if (rc != 0) + return rc; + + register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); + return rc; +} + +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); + if (rc != 0) + return rc; + return 0; +} + +static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + switch (c->modrm_reg) { + case 0: /* rol */ + emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); + break; + } +} + +static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + switch (c->modrm_reg) { + case 0 ... 1: /* test */ + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 2: /* not */ + c->dst.val = ~c->dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", c->dst, ctxt->eflags); + break; + default: + DPRINTF("Cannot emulate %02x\n", c->b); + rc = X86EMUL_UNHANDLEABLE; + break; + } + return rc; +} + +static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + + switch (c->modrm_reg) { + case 0: /* inc */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 1: /* dec */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 2: /* call near abs */ { + long int old_eip; + old_eip = c->eip; + c->eip = c->src.val; + c->src.val = old_eip; + emulate_push(ctxt); + break; + } + case 4: /* jmp abs */ + c->eip = c->src.val; + break; + case 6: /* push */ + emulate_push(ctxt); + break; + } + return 0; +} + +static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long memop) +{ + struct decode_cache *c = &ctxt->decode; + u64 old, new; + int rc; + + rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); + if (rc != 0) + return rc; + + if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { + + c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); + c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); + ctxt->eflags &= ~EFLG_ZF; + + } else { + new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + (u32) c->regs[VCPU_REGS_RBX]; + + rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); + if (rc != 0) + return rc; + ctxt->eflags |= EFLG_ZF; + } + return 0; +} + +static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned long cs; + + rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + if (rc) + return rc; + if (c->op_bytes == 4) + c->eip = (u32)c->eip; + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + if (rc) + return rc; + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + return rc; +} + +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. + */ + switch (c->dst.bytes) { + case 1: + *(u8 *)c->dst.ptr = (u8)c->dst.val; + break; + case 2: + *(u16 *)c->dst.ptr = (u16)c->dst.val; + break; + case 4: + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; + break; + } + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + else + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != 0) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return 0; +} + +static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. The same goes for mov ss + */ + if (!(int_shadow & mask)) + ctxt->interruptibility = mask; +} + +static inline void +setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, + struct kvm_segment *cs, struct kvm_segment *ss) +{ + memset(cs, 0, sizeof(struct kvm_segment)); + kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); + memset(ss, 0, sizeof(struct kvm_segment)); + + cs->l = 0; /* will be adjusted later */ + cs->base = 0; /* flat segment */ + cs->g = 1; /* 4kb granularity */ + cs->limit = 0xffffffff; /* 4GB limit */ + cs->type = 0x0b; /* Read, Execute, Accessed */ + cs->s = 1; + cs->dpl = 0; /* will be adjusted later */ + cs->present = 1; + cs->db = 1; + + ss->unusable = 0; + ss->base = 0; /* flat segment */ + ss->limit = 0xffffffff; /* 4GB limit */ + ss->g = 1; /* 4kb granularity */ + ss->s = 1; + ss->type = 0x03; /* Read/Write, Accessed */ + ss->db = 1; /* 32bit stack segment */ + ss->dpl = 0; + ss->present = 1; +} + +static int +emulate_syscall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* syscall is not available in real mode */ + if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + msr_data >>= 32; + cs.selector = (u16)(msr_data & 0xfffc); + ss.selector = (u16)(msr_data + 8); + + if (is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->regs[VCPU_REGS_RCX] = c->eip; + if (is_long_mode(ctxt->vcpu)) { +#ifdef CONFIG_X86_64 + c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + + kvm_x86_ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? + MSR_LSTAR : MSR_CSTAR, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ctxt->eflags &= ~(msr_data | EFLG_RF); +#endif + } else { + /* legacy mode */ + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + c->eip = (u32)msr_data; + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + } + + return 0; +} + +static int +emulate_sysenter(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL || + !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* XXX sysenter/sysexit have not been tested in 64bit mode. + * Therefore, we inject an #UD. + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (ctxt->mode) { + case X86EMUL_MODE_PROT32: + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + case X86EMUL_MODE_PROT64: + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + } + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + cs.selector = (u16)msr_data; + cs.selector &= ~SELECTOR_RPL_MASK; + ss.selector = cs.selector + 8; + ss.selector &= ~SELECTOR_RPL_MASK; + if (ctxt->mode == X86EMUL_MODE_PROT64 + || is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + c->regs[VCPU_REGS_RSP] = msr_data; + + return 0; +} + +static int +emulate_sysexit(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + int usermode; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* sysexit must be called from CPL 0 */ + if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + setup_syscalls_segments(ctxt, &cs, &ss); + + if ((c->rex_prefix & 0x8) != 0x0) + usermode = X86EMUL_MODE_PROT64; + else + usermode = X86EMUL_MODE_PROT32; + + cs.dpl = 3; + ss.dpl = 3; + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (usermode) { + case X86EMUL_MODE_PROT32: + cs.selector = (u16)(msr_data + 16); + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = (u16)(msr_data + 24); + break; + case X86EMUL_MODE_PROT64: + cs.selector = (u16)(msr_data + 32); + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = cs.selector + 8; + cs.db = 0; + cs.l = 1; + break; + } + cs.selector |= SELECTOR_RPL_MASK; + ss.selector |= SELECTOR_RPL_MASK; + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; + + return 0; +} + +int +x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + unsigned long memop = 0; + u64 msr_data; + unsigned long saved_eip = 0; + struct decode_cache *c = &ctxt->decode; + unsigned int port; + int io_dir_in; + int rc = 0; + + ctxt->interruptibility = 0; + + /* Shadow copy of register state. Committed on successful emulation. + * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't + * modify them. + */ + + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + saved_eip = c->eip; + + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) + memop = c->modrm_ea; + + if (c->rep_prefix && (c->d & String)) { + /* All REP prefixes have the same first termination condition */ + if (c->regs[VCPU_REGS_RCX] == 0) { + kvm_rip_write(ctxt->vcpu, c->eip); + goto done; + } + /* The second termination condition only applies for REPE + * and REPNE. Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if ((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) { + if ((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) { + kvm_rip_write(ctxt->vcpu, c->eip); + goto done; + } + if ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { + kvm_rip_write(ctxt->vcpu, c->eip); + goto done; + } + } + c->regs[VCPU_REGS_RCX]--; + c->eip = kvm_rip_read(ctxt->vcpu); + } + + if (c->src.type == OP_MEM) { + c->src.ptr = (unsigned long *)memop; + c->src.val = 0; + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != 0) + goto done; + c->src.orig_val = c->src.val; + } + + if ((c->d & DstMask) == ImplicitOps) + goto special_insn; + + + if (c->dst.type == OP_MEM) { + c->dst.ptr = (unsigned long *)memop; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.val = 0; + if (c->d & BitOp) { + unsigned long mask = ~(c->dst.bytes * 8 - 1); + + c->dst.ptr = (void *)c->dst.ptr + + (c->src.val & mask) / 8; + } + if (!(c->d & Mov) && + /* optimisation - avoid slow emulated read */ + ((rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, ctxt->vcpu)) != 0)) + goto done; + } + c->dst.orig_val = c->dst.val; + +special_insn: + + if (c->twobyte) + goto twobyte_insn; + + switch (c->b) { + case 0x00 ... 0x05: + add: /* add */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + break; + case 0x08 ... 0x0d: + or: /* or */ + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + break; + case 0x10 ... 0x15: + adc: /* adc */ + emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + break; + case 0x18 ... 0x1d: + sbb: /* sbb */ + emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + break; + case 0x20 ... 0x25: + and: /* and */ + emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + break; + case 0x28 ... 0x2d: + sub: /* sub */ + emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + break; + case 0x30 ... 0x35: + xor: /* xor */ + emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + break; + case 0x38 ... 0x3d: + cmp: /* cmp */ + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + break; + case 0x40 ... 0x47: /* inc r16/r32 */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 0x48 ... 0x4f: /* dec r16/r32 */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 0x50 ... 0x57: /* push reg */ + emulate_push(ctxt); + break; + case 0x58 ... 0x5f: /* pop reg */ + pop_instruction: + rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != 0) + goto done; + break; + case 0x63: /* movsxd */ + if (ctxt->mode != X86EMUL_MODE_PROT64) + goto cannot_emulate; + c->dst.val = (s32) c->src.val; + break; + case 0x68: /* push imm */ + case 0x6a: /* push imm8 */ + emulate_push(ctxt); + break; + case 0x6c: /* insb */ + case 0x6d: /* insw/insd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 1, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(c, es_base(ctxt), + c->regs[VCPU_REGS_RDI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; + case 0x6e: /* outsb */ + case 0x6f: /* outsw/outsd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 0, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; + case 0x70 ... 0x7f: /* jcc (short) */ + if (test_cc(c->b, ctxt->eflags)) + jmp_rel(c, c->src.val); + break; + case 0x80 ... 0x83: /* Grp1 */ + switch (c->modrm_reg) { + case 0: + goto add; + case 1: + goto or; + case 2: + goto adc; + case 3: + goto sbb; + case 4: + goto and; + case 5: + goto sub; + case 6: + goto xor; + case 7: + goto cmp; + } + break; + case 0x84 ... 0x85: + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 0x86 ... 0x87: /* xchg */ + xchg: + /* Write back the register source. */ + switch (c->dst.bytes) { + case 1: + *(u8 *) c->src.ptr = (u8) c->dst.val; + break; + case 2: + *(u16 *) c->src.ptr = (u16) c->dst.val; + break; + case 4: + *c->src.ptr = (u32) c->dst.val; + break; /* 64b reg: zero-extend */ + case 8: + *c->src.ptr = c->dst.val; + break; + } + /* + * Write back the memory destination with implicit LOCK + * prefix. + */ + c->dst.val = c->src.val; + c->lock_prefix = 1; + break; + case 0x88 ... 0x8b: /* mov */ + goto mov; + case 0x8c: { /* mov r/m, sreg */ + struct kvm_segment segreg; + + if (c->modrm_reg <= 5) + kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); + else { + printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", + c->modrm); + goto cannot_emulate; + } + c->dst.val = segreg.selector; + break; + } + case 0x8d: /* lea r16/r32, m */ + c->dst.val = c->modrm_ea; + break; + case 0x8e: { /* mov seg, r/m16 */ + uint16_t sel; + int type_bits; + int err; + + sel = c->src.val; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + + if (c->modrm_reg <= 5) { + type_bits = (c->modrm_reg == 1) ? 9 : 1; + err = kvm_load_segment_descriptor(ctxt->vcpu, sel, + type_bits, c->modrm_reg); + } else { + printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", + c->modrm); + goto cannot_emulate; + } + + if (err < 0) + goto cannot_emulate; + + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + } + case 0x8f: /* pop (sole member of Grp1a) */ + rc = emulate_grp1a(ctxt, ops); + if (rc != 0) + goto done; + break; + case 0x90: /* nop / xchg r8,rax */ + if (!(c->rex_prefix & 1)) { /* nop */ + c->dst.type = OP_NONE; + break; + } + case 0x91 ... 0x97: /* xchg reg,rax */ + c->src.type = c->dst.type = OP_REG; + c->src.bytes = c->dst.bytes = c->op_bytes; + c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; + c->src.val = *(c->src.ptr); + goto xchg; + case 0x9c: /* pushf */ + c->src.val = (unsigned long) ctxt->eflags; + emulate_push(ctxt); + break; + case 0x9d: /* popf */ + c->dst.type = OP_REG; + c->dst.ptr = (unsigned long *) &ctxt->eflags; + c->dst.bytes = c->op_bytes; + goto pop_instruction; + case 0xa0 ... 0xa1: /* mov */ + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.val = c->src.val; + break; + case 0xa2 ... 0xa3: /* mov */ + c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; + break; + case 0xa4 ... 0xa5: /* movs */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address(c, + es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, ctxt->vcpu)) != 0) + goto done; + register_address_increment(c, &c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xa6 ... 0xa7: /* cmps */ + c->src.type = OP_NONE; /* Disable writeback. */ + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = (unsigned long *)register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]); + if ((rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu)) != 0) + goto done; + + c->dst.type = OP_NONE; /* Disable writeback. */ + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address(c, + es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu)) != 0) + goto done; + + DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); + + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + + register_address_increment(c, &c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->src.bytes + : c->src.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + + break; + case 0xaa ... 0xab: /* stos */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address(c, + es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + c->dst.val = c->regs[VCPU_REGS_RAX]; + register_address_increment(c, &c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xac ... 0xad: /* lods */ + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + if ((rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu)) != 0) + goto done; + register_address_increment(c, &c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xae ... 0xaf: /* scas */ + DPRINTF("Urk! I don't handle SCAS.\n"); + goto cannot_emulate; + case 0xb0 ... 0xbf: /* mov r, imm */ + goto mov; + case 0xc0 ... 0xc1: + emulate_grp2(ctxt); + break; + case 0xc3: /* ret */ + c->dst.type = OP_REG; + c->dst.ptr = &c->eip; + c->dst.bytes = c->op_bytes; + goto pop_instruction; + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + mov: + c->dst.val = c->src.val; + break; + case 0xcb: /* ret far */ + rc = emulate_ret_far(ctxt, ops); + if (rc) + goto done; + break; + case 0xd0 ... 0xd1: /* Grp2 */ + c->src.val = 1; + emulate_grp2(ctxt); + break; + case 0xd2 ... 0xd3: /* Grp2 */ + c->src.val = c->regs[VCPU_REGS_RCX]; + emulate_grp2(ctxt); + break; + case 0xe4: /* inb */ + case 0xe5: /* in */ + port = c->src.val; + io_dir_in = 1; + goto do_io; + case 0xe6: /* outb */ + case 0xe7: /* out */ + port = c->src.val; + io_dir_in = 0; + goto do_io; + case 0xe8: /* call (near) */ { + long int rel = c->src.val; + c->src.val = (unsigned long) c->eip; + jmp_rel(c, rel); + emulate_push(ctxt); + break; + } + case 0xe9: /* jmp rel */ + goto jmp; + case 0xea: /* jmp far */ + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, + VCPU_SREG_CS) < 0) { + DPRINTF("jmp far: Failed to load CS descriptor\n"); + goto cannot_emulate; + } + + c->eip = c->src.val; + break; + case 0xeb: + jmp: /* jmp rel short */ + jmp_rel(c, c->src.val); + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xec: /* in al,dx */ + case 0xed: /* in (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 1; + goto do_io; + case 0xee: /* out al,dx */ + case 0xef: /* out (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 0; + do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, + (c->d & ByteOp) ? 1 : c->op_bytes, + port) != 0) { + c->eip = saved_eip; + goto cannot_emulate; + } + break; + case 0xf4: /* hlt */ + ctxt->vcpu->arch.halt_request = 1; + break; + case 0xf5: /* cmc */ + /* complement carry flag from eflags reg */ + ctxt->eflags ^= EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf6 ... 0xf7: /* Grp3 */ + rc = emulate_grp3(ctxt, ops); + if (rc != 0) + goto done; + break; + case 0xf8: /* clc */ + ctxt->eflags &= ~EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfa: /* cli */ + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfb: /* sti */ + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfc: /* cld */ + ctxt->eflags &= ~EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfd: /* std */ + ctxt->eflags |= EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfe ... 0xff: /* Grp4/Grp5 */ + rc = emulate_grp45(ctxt, ops); + if (rc != 0) + goto done; + break; + } + +writeback: + rc = writeback(ctxt, ops); + if (rc != 0) + goto done; + + /* Commit shadow register state. */ + memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(ctxt->vcpu, c->eip); + +done: + if (rc == X86EMUL_UNHANDLEABLE) { + c->eip = saved_eip; + return -1; + } + return 0; + +twobyte_insn: + switch (c->b) { + case 0x01: /* lgdt, lidt, lmsw */ + switch (c->modrm_reg) { + u16 size; + unsigned long address; + + case 0: /* vmcall */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + goto cannot_emulate; + + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; + + /* Let the processor re-execute the fixed hypercall */ + c->eip = kvm_rip_read(ctxt->vcpu); + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 2: /* lgdt */ + rc = read_descriptor(ctxt, ops, c->src.ptr, + &size, &address, c->op_bytes); + if (rc) + goto done; + realmode_lgdt(ctxt->vcpu, size, address); + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 3: /* lidt/vmmcall */ + if (c->modrm_mod == 3) { + switch (c->modrm_rm) { + case 1: + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; + break; + default: + goto cannot_emulate; + } + } else { + rc = read_descriptor(ctxt, ops, c->src.ptr, + &size, &address, + c->op_bytes); + if (rc) + goto done; + realmode_lidt(ctxt->vcpu, size, address); + } + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 4: /* smsw */ + c->dst.bytes = 2; + c->dst.val = realmode_get_cr(ctxt->vcpu, 0); + break; + case 6: /* lmsw */ + realmode_lmsw(ctxt->vcpu, (u16)c->src.val, + &ctxt->eflags); + c->dst.type = OP_NONE; + break; + case 7: /* invlpg*/ + emulate_invlpg(ctxt->vcpu, memop); + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + default: + goto cannot_emulate; + } + break; + case 0x05: /* syscall */ + if (emulate_syscall(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; + case 0x06: + emulate_clts(ctxt->vcpu); + c->dst.type = OP_NONE; + break; + case 0x08: /* invd */ + case 0x09: /* wbinvd */ + case 0x0d: /* GrpP (prefetch) */ + case 0x18: /* Grp16 (prefetch/nop) */ + c->dst.type = OP_NONE; + break; + case 0x20: /* mov cr, reg */ + if (c->modrm_mod != 3) + goto cannot_emulate; + c->regs[c->modrm_rm] = + realmode_get_cr(ctxt->vcpu, c->modrm_reg); + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x21: /* mov from dr to reg */ + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x22: /* mov reg, cr */ + if (c->modrm_mod != 3) + goto cannot_emulate; + realmode_set_cr(ctxt->vcpu, + c->modrm_reg, c->modrm_val, &ctxt->eflags); + c->dst.type = OP_NONE; + break; + case 0x23: /* mov from reg to dr */ + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_set_dr(ctxt, c->modrm_reg, + c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x30: + /* wrmsr */ + msr_data = (u32)c->regs[VCPU_REGS_RAX] + | ((u64)c->regs[VCPU_REGS_RDX] << 32); + rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = kvm_rip_read(ctxt->vcpu); + } + rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; + break; + case 0x32: + /* rdmsr */ + rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = kvm_rip_read(ctxt->vcpu); + } else { + c->regs[VCPU_REGS_RAX] = (u32)msr_data; + c->regs[VCPU_REGS_RDX] = msr_data >> 32; + } + rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; + break; + case 0x34: /* sysenter */ + if (emulate_sysenter(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; + case 0x35: /* sysexit */ + if (emulate_sysexit(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; + case 0x40 ... 0x4f: /* cmov */ + c->dst.val = c->dst.orig_val = c->src.val; + if (!test_cc(c->b, ctxt->eflags)) + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x80 ... 0x8f: /* jnz rel, etc*/ + if (test_cc(c->b, ctxt->eflags)) + jmp_rel(c, c->src.val); + c->dst.type = OP_NONE; + break; + case 0xa3: + bt: /* bt */ + c->dst.type = OP_NONE; + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); + break; + case 0xa4: /* shld imm8, r, r/m */ + case 0xa5: /* shld cl, r, r/m */ + emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); + break; + case 0xab: + bts: /* bts */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); + break; + case 0xac: /* shrd imm8, r, r/m */ + case 0xad: /* shrd cl, r, r/m */ + emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); + break; + case 0xae: /* clflush */ + break; + case 0xb0 ... 0xb1: /* cmpxchg */ + /* + * Save real source value, then compare EAX against + * destination. + */ + c->src.orig_val = c->src.val; + c->src.val = c->regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + if (ctxt->eflags & EFLG_ZF) { + /* Success: write back to memory. */ + c->dst.val = c->src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + c->dst.type = OP_REG; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + } + break; + case 0xb3: + btr: /* btr */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); + break; + case 0xb6 ... 0xb7: /* movzx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? (u8) c->src.val + : (u16) c->src.val; + break; + case 0xba: /* Grp8 */ + switch (c->modrm_reg & 3) { + case 0: + goto bt; + case 1: + goto bts; + case 2: + goto btr; + case 3: + goto btc; + } + break; + case 0xbb: + btc: /* btc */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); + break; + case 0xbe ... 0xbf: /* movsx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : + (s16) c->src.val; + break; + case 0xc3: /* movnti */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : + (u64) c->src.val; + break; + case 0xc7: /* Grp9 (cmpxchg8b) */ + rc = emulate_grp9(ctxt, ops, memop); + if (rc != 0) + goto done; + c->dst.type = OP_NONE; + break; + } + goto writeback; + +cannot_emulate: + DPRINTF("Cannot emulate %02x\n", c->b); + c->eip = saved_eip; + return -1; +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1aa7e6d91d4f..c0e942747b0f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2759,7 +2759,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; /* - * TODO: fix x86_emulate.c to use guest_read/write_register + * TODO: fix emulate.c to use guest_read/write_register * instead of direct ->regs accesses, can save hundred cycles * on Intel for instructions that don't read/change RSP, for * for example. diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c deleted file mode 100644 index c6663d46f328..000000000000 --- a/arch/x86/kvm/x86_emulate.c +++ /dev/null @@ -1,2392 +0,0 @@ -/****************************************************************************** - * x86_emulate.c - * - * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. - * - * Copyright (c) 2005 Keir Fraser - * - * Linux coding style, mod r/m decoder, segment base fixes, real-mode - * privileged instructions: - * - * Copyright (C) 2006 Qumranet - * - * Avi Kivity - * Yaniv Kamay - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 - */ - -#ifndef __KERNEL__ -#include -#include -#include -#define DPRINTF(_f, _a ...) printf(_f , ## _a) -#else -#include -#include "kvm_cache_regs.h" -#define DPRINTF(x...) do {} while (0) -#endif -#include -#include - -#include "mmu.h" /* for is_long_mode() */ - -/* - * Opcode effective-address decode tables. - * Note that we only emulate instructions that have at least one memory - * operand (excluding implicit stack references). We assume that stack - * references and instruction fetches will never occur in special memory - * areas that require emulation. So, for example, 'mov ,' need - * not be handled. - */ - -/* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ -/* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstAcc (4<<1) /* Destination Accumulator */ -#define DstMask (7<<1) -/* Source operand type. */ -#define SrcNone (0<<4) /* No source operand. */ -#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ -#define SrcReg (1<<4) /* Register operand. */ -#define SrcMem (2<<4) /* Memory operand. */ -#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ -#define SrcImm (5<<4) /* Immediate operand. */ -#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ -#define SrcOne (7<<4) /* Implied '1' */ -#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ -#define SrcImmU (9<<4) /* Immediate operand, unsigned */ -#define SrcMask (0xf<<4) -/* Generic ModRM decode. */ -#define ModRM (1<<8) -/* Destination is only written; never read. */ -#define Mov (1<<9) -#define BitOp (1<<10) -#define MemAbs (1<<11) /* Memory operand is absolute displacement */ -#define String (1<<12) /* String instruction (rep capable) */ -#define Stack (1<<13) /* Stack instruction (push/pop) */ -#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ -#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ -#define GroupMask 0xff /* Group number stored in bits 0:7 */ -/* Source 2 operand type */ -#define Src2None (0<<29) -#define Src2CL (1<<29) -#define Src2ImmByte (2<<29) -#define Src2One (3<<29) -#define Src2Imm16 (4<<29) -#define Src2Mask (7<<29) - -enum { - Group1_80, Group1_81, Group1_82, Group1_83, - Group1A, Group3_Byte, Group3, Group4, Group5, Group7, -}; - -static u32 opcode_table[256] = { - /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, - /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, - /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - 0, 0, - /* 0x40 - 0x47 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x48 - 0x4F */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x50 - 0x57 */ - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - /* 0x58 - 0x5F */ - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - /* 0x60 - 0x67 */ - 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, - /* 0x68 - 0x6F */ - SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x78 - 0x7F */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x80 - 0x87 */ - Group | Group1_80, Group | Group1_81, - Group | Group1_82, Group | Group1_83, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, - DstReg | SrcMem | ModRM | Mov, Group | Group1A, - /* 0x90 - 0x97 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x98 - 0x9F */ - 0, 0, SrcImm | Src2Imm16, 0, - ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, - /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xA8 - 0xAF */ - 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xB0 - 0xB7 */ - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - /* 0xB8 - 0xBF */ - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps | Stack, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, - /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, - ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, - /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, - /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, - ByteOp | SrcImmUByte, SrcImmUByte, - ByteOp | SrcImmUByte, SrcImmUByte, - /* 0xE8 - 0xEF */ - SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, - /* 0xF8 - 0xFF */ - ImplicitOps, 0, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, -}; - -static u32 twobyte_table[256] = { - /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, - /* 0x10 - 0x1F */ - 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, - /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x80 - 0x8F */ - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, 0, 0, - /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, - ModRM, 0, - /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xF0 - 0xFF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -static u32 group_table[] = { - [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - [Group1A*8] = - DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, - [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, 0, - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, - 0, 0, 0, 0, - [Group3*8] = - DstMem | SrcImm | ModRM, 0, - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - 0, 0, 0, 0, - [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, - 0, 0, 0, 0, 0, 0, - [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, - [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, -}; - -static u32 group2_table[] = { - [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, 0, -}; - -/* EFLAGS bit definitions. */ -#define EFLG_VM (1<<17) -#define EFLG_RF (1<<16) -#define EFLG_OF (1<<11) -#define EFLG_DF (1<<10) -#define EFLG_IF (1<<9) -#define EFLG_SF (1<<7) -#define EFLG_ZF (1<<6) -#define EFLG_AF (1<<4) -#define EFLG_PF (1<<2) -#define EFLG_CF (1<<0) - -/* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* - * These EFLAGS bits are restored from saved value during emulation, and - * any changes are written back to the saved value after emulation. - */ -#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) - -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - -#ifdef CONFIG_X86_64 -#define ON64(x) x -#else -#define ON64(x) -#endif - -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op _suffix " %"_x"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _y ((_src).val), "i" (EFLAGS_MASK)); \ - } while (0) - - -/* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ - break; \ - case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ - break; \ - case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((_dst).bytes) { \ - case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ - break; \ - default: \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - "w", "r", _LO32, "r", "", "r") - -/* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (_cl).val; \ - _type _srcv = (_src).val; \ - _type _dstv = (_dst).val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (_cl).val = (unsigned long) _clv; \ - (_src).val = (unsigned long) _srcv; \ - (_dst).val = (unsigned long) _dstv; \ - } while (0) - -#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ - do { \ - switch ((_dst).bytes) { \ - case 2: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "w", unsigned short); \ - break; \ - case 4: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "l", unsigned int); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "q", unsigned long)); \ - break; \ - } \ - } while (0) - -#define __emulate_1op(_op, _dst, _eflags, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op _suffix " %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "+m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ - do { \ - switch ((_dst).bytes) { \ - case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ - case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ - case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ - } \ - } while (0) - -/* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ -({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != 0) \ - goto done; \ - (_eip) += (_size); \ - (_type)_x; \ -}) - -static inline unsigned long ad_mask(struct decode_cache *c) -{ - return (1UL << (c->ad_bytes << 3)) - 1; -} - -/* Access/update address held in a register, based on addressing mode. */ -static inline unsigned long -address_mask(struct decode_cache *c, unsigned long reg) -{ - if (c->ad_bytes == sizeof(unsigned long)) - return reg; - else - return reg & ad_mask(c); -} - -static inline unsigned long -register_address(struct decode_cache *c, unsigned long base, unsigned long reg) -{ - return base + address_mask(c, reg); -} - -static inline void -register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) -{ - if (c->ad_bytes == sizeof(unsigned long)) - *reg += inc; - else - *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); -} - -static inline void jmp_rel(struct decode_cache *c, int rel) -{ - register_address_increment(c, &c->eip, rel); -} - -static void set_seg_override(struct decode_cache *c, int seg) -{ - c->has_seg_override = true; - c->seg_override = seg; -} - -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) -{ - if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) - return 0; - - return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); -} - -static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, - struct decode_cache *c) -{ - if (!c->has_seg_override) - return 0; - - return seg_base(ctxt, c->seg_override); -} - -static unsigned long es_base(struct x86_emulate_ctxt *ctxt) -{ - return seg_base(ctxt, VCPU_SREG_ES); -} - -static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) -{ - return seg_base(ctxt, VCPU_SREG_SS); -} - -static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long linear, u8 *dest) -{ - struct fetch_cache *fc = &ctxt->decode.fetch; - int rc; - int size; - - if (linear < fc->start || linear >= fc->end) { - size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); - if (rc) - return rc; - fc->start = linear; - fc->end = linear + size; - } - *dest = fc->data[linear - fc->start]; - return 0; -} - -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long eip, void *dest, unsigned size) -{ - int rc = 0; - - eip += ctxt->cs_base; - while (size--) { - rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); - if (rc) - return rc; - } - return 0; -} - -/* - * Given the 'reg' portion of a ModRM byte, and a register block, return a - * pointer into the block that addresses the relevant register. - * @highbyte_regs specifies whether to decode AH,CH,DH,BH. - */ -static void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) -{ - void *p; - - p = ®s[modrm_reg]; - if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)®s[modrm_reg & 3] + 1; - return p; -} - -static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *ptr, - u16 *size, unsigned long *address, int op_bytes) -{ - int rc; - - if (op_bytes == 2) - op_bytes = 3; - *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); - if (rc) - return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); - return rc; -} - -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); -} - -static void decode_register_operand(struct operand *op, - struct decode_cache *c, - int inhibit_bytereg) -{ - unsigned reg = c->modrm_reg; - int highbyte_regs = c->rex_prefix == 0; - - if (!(c->d & ModRM)) - reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); - op->type = OP_REG; - if ((c->d & ByteOp) && !inhibit_bytereg) { - op->ptr = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->ptr; - op->bytes = 1; - } else { - op->ptr = decode_register(reg, c->regs, 0); - op->bytes = c->op_bytes; - switch (op->bytes) { - case 2: - op->val = *(u16 *)op->ptr; - break; - case 4: - op->val = *(u32 *)op->ptr; - break; - case 8: - op->val = *(u64 *) op->ptr; - break; - } - } - op->orig_val = op->val; -} - -static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - u8 sib; - int index_reg = 0, base_reg = 0, scale; - int rc = 0; - - if (c->rex_prefix) { - c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ - c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ - } - - c->modrm = insn_fetch(u8, 1, c->eip); - c->modrm_mod |= (c->modrm & 0xc0) >> 6; - c->modrm_reg |= (c->modrm & 0x38) >> 3; - c->modrm_rm |= (c->modrm & 0x07); - c->modrm_ea = 0; - c->use_modrm_ea = 1; - - if (c->modrm_mod == 3) { - c->modrm_ptr = decode_register(c->modrm_rm, - c->regs, c->d & ByteOp); - c->modrm_val = *(unsigned long *)c->modrm_ptr; - return rc; - } - - if (c->ad_bytes == 2) { - unsigned bx = c->regs[VCPU_REGS_RBX]; - unsigned bp = c->regs[VCPU_REGS_RBP]; - unsigned si = c->regs[VCPU_REGS_RSI]; - unsigned di = c->regs[VCPU_REGS_RDI]; - - /* 16-bit ModR/M decode. */ - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 6) - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - } - switch (c->modrm_rm) { - case 0: - c->modrm_ea += bx + si; - break; - case 1: - c->modrm_ea += bx + di; - break; - case 2: - c->modrm_ea += bp + si; - break; - case 3: - c->modrm_ea += bp + di; - break; - case 4: - c->modrm_ea += si; - break; - case 5: - c->modrm_ea += di; - break; - case 6: - if (c->modrm_mod != 0) - c->modrm_ea += bp; - break; - case 7: - c->modrm_ea += bx; - break; - } - if (c->modrm_rm == 2 || c->modrm_rm == 3 || - (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_SS); - c->modrm_ea = (u16)c->modrm_ea; - } else { - /* 32/64-bit ModR/M decode. */ - if ((c->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, 1, c->eip); - index_reg |= (sib >> 3) & 7; - base_reg |= sib & 7; - scale = sib >> 6; - - if ((base_reg & 7) == 5 && c->modrm_mod == 0) - c->modrm_ea += insn_fetch(s32, 4, c->eip); - else - c->modrm_ea += c->regs[base_reg]; - if (index_reg != 4) - c->modrm_ea += c->regs[index_reg] << scale; - } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { - if (ctxt->mode == X86EMUL_MODE_PROT64) - c->rip_relative = 1; - } else - c->modrm_ea += c->regs[c->modrm_rm]; - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 5) - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - } - } -done: - return rc; -} - -static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->ad_bytes) { - case 2: - c->modrm_ea = insn_fetch(u16, 2, c->eip); - break; - case 4: - c->modrm_ea = insn_fetch(u32, 4, c->eip); - break; - case 8: - c->modrm_ea = insn_fetch(u64, 8, c->eip); - break; - } -done: - return rc; -} - -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group; - - /* Shadow copy of register state. Committed on successful emulation. */ - - memset(c, 0, sizeof(struct decode_cache)); - c->eip = kvm_rip_read(ctxt->vcpu); - ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; - } - - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. */ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x26: /* ES override */ - case 0x2e: /* CS override */ - case 0x36: /* SS override */ - case 0x3e: /* DS override */ - set_seg_override(c, (c->b >> 3) & 3); - break; - case 0x64: /* FS override */ - case 0x65: /* GS override */ - set_seg_override(c, c->b & 7); - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ - - c->rex_prefix = 0; - } - -done_prefixes: - - /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ - - /* Opcode byte(s). */ - c->d = opcode_table[c->b]; - if (c->d == 0) { - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b]; - } - } - - if (c->d & Group) { - group = c->d & GroupMask; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; - - group = (group << 3) + ((c->modrm >> 3) & 7); - if ((c->d & GroupDual) && (c->modrm >> 6) == 3) - c->d = group2_table[group]; - else - c->d = group_table[group]; - } - - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } - - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; - - /* ModRM and SIB bytes. */ - if (c->d & ModRM) - rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); - if (rc) - goto done; - - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_DS); - - if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, c); - - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - switch (c->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(&c->src, c, 0); - break; - case SrcMem16: - c->src.bytes = 2; - goto srcmem_common; - case SrcMem32: - c->src.bytes = 4; - goto srcmem_common; - case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : - c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) - break; - srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. - */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - c->src.val = c->modrm_val; - c->src.ptr = c->modrm_ptr; - break; - } - c->src.type = OP_MEM; - break; - case SrcImm: - case SrcImmU: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. */ - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - if ((c->d & SrcMask) == SrcImmU) { - switch (c->src.bytes) { - case 1: - c->src.val &= 0xff; - break; - case 2: - c->src.val &= 0xffff; - break; - case 4: - c->src.val &= 0xffffffff; - break; - } - } - break; - case SrcImmByte: - case SrcImmUByte: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = 1; - if ((c->d & SrcMask) == SrcImmByte) - c->src.val = insn_fetch(s8, 1, c->eip); - else - c->src.val = insn_fetch(u8, 1, c->eip); - break; - case SrcOne: - c->src.bytes = 1; - c->src.val = 1; - break; - } - - /* - * Decode and fetch the second source operand: register, memory - * or immediate. - */ - switch (c->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - c->src2.bytes = 1; - c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; - break; - case Src2ImmByte: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 1; - c->src2.val = insn_fetch(u8, 1, c->eip); - break; - case Src2Imm16: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 2; - c->src2.val = insn_fetch(u16, 2, c->eip); - break; - case Src2One: - c->src2.bytes = 1; - c->src2.val = 1; - break; - } - - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstMem: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.type = OP_REG; - c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.ptr = c->modrm_ptr; - break; - } - c->dst.type = OP_MEM; - break; - case DstAcc: - c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->op_bytes) { - case 1: - c->dst.val = *(u8 *)c->dst.ptr; - break; - case 2: - c->dst.val = *(u16 *)c->dst.ptr; - break; - case 4: - c->dst.val = *(u32 *)c->dst.ptr; - break; - } - c->dst.orig_val = c->dst.val; - break; - } - - if (c->rip_relative) - c->modrm_ea += c->eip; - -done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; -} - -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]); -} - -static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *dest, int len) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - rc = ops->read_emulated(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), - dest, len, ctxt->vcpu); - if (rc != 0) - return rc; - - register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); - return rc; -} - -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); - if (rc != 0) - return rc; - return 0; -} - -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - switch (c->modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); - break; - case 1: /* ror */ - emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); - break; - case 2: /* rcl */ - emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); - break; - case 3: /* rcr */ - emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); - break; - case 5: /* shr */ - emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); - break; - case 7: /* sar */ - emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); - break; - } -} - -static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->modrm_reg) { - case 0 ... 1: /* test */ - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 2: /* not */ - c->dst.val = ~c->dst.val; - break; - case 3: /* neg */ - emulate_1op("neg", c->dst, ctxt->eflags); - break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - rc = X86EMUL_UNHANDLEABLE; - break; - } - return rc; -} - -static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - - switch (c->modrm_reg) { - case 0: /* inc */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 1: /* dec */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 2: /* call near abs */ { - long int old_eip; - old_eip = c->eip; - c->eip = c->src.val; - c->src.val = old_eip; - emulate_push(ctxt); - break; - } - case 4: /* jmp abs */ - c->eip = c->src.val; - break; - case 6: /* push */ - emulate_push(ctxt); - break; - } - return 0; -} - -static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long memop) -{ - struct decode_cache *c = &ctxt->decode; - u64 old, new; - int rc; - - rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) - return rc; - - if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { - - c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); - ctxt->eflags &= ~EFLG_ZF; - - } else { - new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | - (u32) c->regs[VCPU_REGS_RBX]; - - rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) - return rc; - ctxt->eflags |= EFLG_ZF; - } - return 0; -} - -static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - unsigned long cs; - - rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); - if (rc) - return rc; - if (c->op_bytes == 4) - c->eip = (u32)c->eip; - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - if (rc) - return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); - return rc; -} - -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. - */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - if (rc != 0) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return 0; -} - -static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) -{ - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); - /* - * an sti; sti; sequence only disable interrupts for the first - * instruction. So, if the last instruction, be it emulated or - * not, left the system with the INT_STI flag enabled, it - * means that the last instruction is an sti. We should not - * leave the flag on in this case. The same goes for mov ss - */ - if (!(int_shadow & mask)) - ctxt->interruptibility = mask; -} - -static inline void -setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct kvm_segment *cs, struct kvm_segment *ss) -{ - memset(cs, 0, sizeof(struct kvm_segment)); - kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct kvm_segment)); - - cs->l = 0; /* will be adjusted later */ - cs->base = 0; /* flat segment */ - cs->g = 1; /* 4kb granularity */ - cs->limit = 0xffffffff; /* 4GB limit */ - cs->type = 0x0b; /* Read, Execute, Accessed */ - cs->s = 1; - cs->dpl = 0; /* will be adjusted later */ - cs->present = 1; - cs->db = 1; - - ss->unusable = 0; - ss->base = 0; /* flat segment */ - ss->limit = 0xffffffff; /* 4GB limit */ - ss->g = 1; /* 4kb granularity */ - ss->s = 1; - ss->type = 0x03; /* Read/Write, Accessed */ - ss->db = 1; /* 32bit stack segment */ - ss->dpl = 0; - ss->present = 1; -} - -static int -emulate_syscall(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; - u64 msr_data; - - /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) - return -1; - - setup_syscalls_segments(ctxt, &cs, &ss); - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); - msr_data >>= 32; - cs.selector = (u16)(msr_data & 0xfffc); - ss.selector = (u16)(msr_data + 8); - - if (is_long_mode(ctxt->vcpu)) { - cs.db = 0; - cs.l = 1; - } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - - c->regs[VCPU_REGS_RCX] = c->eip; - if (is_long_mode(ctxt->vcpu)) { -#ifdef CONFIG_X86_64 - c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - - kvm_x86_ops->get_msr(ctxt->vcpu, - ctxt->mode == X86EMUL_MODE_PROT64 ? - MSR_LSTAR : MSR_CSTAR, &msr_data); - c->eip = msr_data; - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); - ctxt->eflags &= ~(msr_data | EFLG_RF); -#endif - } else { - /* legacy mode */ - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); - c->eip = (u32)msr_data; - - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - } - - return 0; -} - -static int -emulate_sysenter(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; - u64 msr_data; - - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* XXX sysenter/sysexit have not been tested in 64bit mode. - * Therefore, we inject an #UD. - */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; - - setup_syscalls_segments(ctxt, &cs, &ss); - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); - switch (ctxt->mode) { - case X86EMUL_MODE_PROT32: - if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - break; - case X86EMUL_MODE_PROT64: - if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - break; - } - - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - cs.selector = (u16)msr_data; - cs.selector &= ~SELECTOR_RPL_MASK; - ss.selector = cs.selector + 8; - ss.selector &= ~SELECTOR_RPL_MASK; - if (ctxt->mode == X86EMUL_MODE_PROT64 - || is_long_mode(ctxt->vcpu)) { - cs.db = 0; - cs.l = 1; - } - - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); - c->eip = msr_data; - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); - c->regs[VCPU_REGS_RSP] = msr_data; - - return 0; -} - -static int -emulate_sysexit(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; - u64 msr_data; - int usermode; - - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - setup_syscalls_segments(ctxt, &cs, &ss); - - if ((c->rex_prefix & 0x8) != 0x0) - usermode = X86EMUL_MODE_PROT64; - else - usermode = X86EMUL_MODE_PROT32; - - cs.dpl = 3; - ss.dpl = 3; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); - switch (usermode) { - case X86EMUL_MODE_PROT32: - cs.selector = (u16)(msr_data + 16); - if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - ss.selector = (u16)(msr_data + 24); - break; - case X86EMUL_MODE_PROT64: - cs.selector = (u16)(msr_data + 32); - if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - ss.selector = cs.selector + 8; - cs.db = 0; - cs.l = 1; - break; - } - cs.selector |= SELECTOR_RPL_MASK; - ss.selector |= SELECTOR_RPL_MASK; - - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - - c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; - c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - - return 0; -} - -int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - unsigned long memop = 0; - u64 msr_data; - unsigned long saved_eip = 0; - struct decode_cache *c = &ctxt->decode; - unsigned int port; - int io_dir_in; - int rc = 0; - - ctxt->interruptibility = 0; - - /* Shadow copy of register state. Committed on successful emulation. - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't - * modify them. - */ - - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - saved_eip = c->eip; - - if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) - memop = c->modrm_ea; - - if (c->rep_prefix && (c->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (c->regs[VCPU_REGS_RCX] == 0) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } - /* The second termination condition only applies for REPE - * and REPNE. Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { - if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } - if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } - } - c->regs[VCPU_REGS_RCX]--; - c->eip = kvm_rip_read(ctxt->vcpu); - } - - if (c->src.type == OP_MEM) { - c->src.ptr = (unsigned long *)memop; - c->src.val = 0; - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); - if (rc != 0) - goto done; - c->src.orig_val = c->src.val; - } - - if ((c->d & DstMask) == ImplicitOps) - goto special_insn; - - - if (c->dst.type == OP_MEM) { - c->dst.ptr = (unsigned long *)memop; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); - - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; - } - c->dst.orig_val = c->dst.val; - -special_insn: - - if (c->twobyte) - goto twobyte_insn; - - switch (c->b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - break; - case 0x10 ... 0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); - break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); - break; - case 0x20 ... 0x25: - and: /* and */ - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); - break; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - break; - case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt); - break; - case 0x58 ... 0x5f: /* pop reg */ - pop_instruction: - rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != 0) - goto done; - break; - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - c->dst.val = (s32) c->src.val; - break; - case 0x68: /* push imm */ - case 0x6a: /* push imm8 */ - emulate_push(ctxt); - break; - case 0x6c: /* insb */ - case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 1, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c, es_base(ctxt), - c->regs[VCPU_REGS_RDI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x6e: /* outsb */ - case 0x6f: /* outsw/outsd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 0, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x70 ... 0x7f: /* jcc (short) */ - if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, c->src.val); - break; - case 0x80 ... 0x83: /* Grp1 */ - switch (c->modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; - case 0x84 ... 0x85: - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 0x86 ... 0x87: /* xchg */ - xchg: - /* Write back the register source. */ - switch (c->dst.bytes) { - case 1: - *(u8 *) c->src.ptr = (u8) c->dst.val; - break; - case 2: - *(u16 *) c->src.ptr = (u16) c->dst.val; - break; - case 4: - *c->src.ptr = (u32) c->dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *c->src.ptr = c->dst.val; - break; - } - /* - * Write back the memory destination with implicit LOCK - * prefix. - */ - c->dst.val = c->src.val; - c->lock_prefix = 1; - break; - case 0x88 ... 0x8b: /* mov */ - goto mov; - case 0x8c: { /* mov r/m, sreg */ - struct kvm_segment segreg; - - if (c->modrm_reg <= 5) - kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); - else { - printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; - } - c->dst.val = segreg.selector; - break; - } - case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_ea; - break; - case 0x8e: { /* mov seg, r/m16 */ - uint16_t sel; - int type_bits; - int err; - - sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; - } - - if (err < 0) - goto cannot_emulate; - - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - } - case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0x90: /* nop / xchg r8,rax */ - if (!(c->rex_prefix & 1)) { /* nop */ - c->dst.type = OP_NONE; - break; - } - case 0x91 ... 0x97: /* xchg reg,rax */ - c->src.type = c->dst.type = OP_REG; - c->src.bytes = c->dst.bytes = c->op_bytes; - c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; - c->src.val = *(c->src.ptr); - goto xchg; - case 0x9c: /* pushf */ - c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt); - break; - case 0x9d: /* popf */ - c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *) &ctxt->eflags; - c->dst.bytes = c->op_bytes; - goto pop_instruction; - case 0xa0 ... 0xa1: /* mov */ - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - c->dst.val = c->src.val; - break; - case 0xa2 ... 0xa3: /* mov */ - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; - break; - case 0xa4 ... 0xa5: /* movs */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) - goto done; - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xa6 ... 0xa7: /* cmps */ - c->src.type = OP_NONE; /* Disable writeback. */ - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *)register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) - goto done; - - c->dst.type = OP_NONE; /* Disable writeback. */ - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); - - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->src.bytes - : c->src.bytes); - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - - break; - case 0xaa ... 0xab: /* stos */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - c->dst.val = c->regs[VCPU_REGS_RAX]; - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xac ... 0xad: /* lods */ - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; - case 0xb0 ... 0xbf: /* mov r, imm */ - goto mov; - case 0xc0 ... 0xc1: - emulate_grp2(ctxt); - break; - case 0xc3: /* ret */ - c->dst.type = OP_REG; - c->dst.ptr = &c->eip; - c->dst.bytes = c->op_bytes; - goto pop_instruction; - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - mov: - c->dst.val = c->src.val; - break; - case 0xcb: /* ret far */ - rc = emulate_ret_far(ctxt, ops); - if (rc) - goto done; - break; - case 0xd0 ... 0xd1: /* Grp2 */ - c->src.val = 1; - emulate_grp2(ctxt); - break; - case 0xd2 ... 0xd3: /* Grp2 */ - c->src.val = c->regs[VCPU_REGS_RCX]; - emulate_grp2(ctxt); - break; - case 0xe4: /* inb */ - case 0xe5: /* in */ - port = c->src.val; - io_dir_in = 1; - goto do_io; - case 0xe6: /* outb */ - case 0xe7: /* out */ - port = c->src.val; - io_dir_in = 0; - goto do_io; - case 0xe8: /* call (near) */ { - long int rel = c->src.val; - c->src.val = (unsigned long) c->eip; - jmp_rel(c, rel); - emulate_push(ctxt); - break; - } - case 0xe9: /* jmp rel */ - goto jmp; - case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } - - c->eip = c->src.val; - break; - case 0xeb: - jmp: /* jmp rel short */ - jmp_rel(c, c->src.val); - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xec: /* in al,dx */ - case 0xed: /* in (e/r)ax,dx */ - port = c->regs[VCPU_REGS_RDX]; - io_dir_in = 1; - goto do_io; - case 0xee: /* out al,dx */ - case 0xef: /* out (e/r)ax,dx */ - port = c->regs[VCPU_REGS_RDX]; - io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, - (c->d & ByteOp) ? 1 : c->op_bytes, - port) != 0) { - c->eip = saved_eip; - goto cannot_emulate; - } - break; - case 0xf4: /* hlt */ - ctxt->vcpu->arch.halt_request = 1; - break; - case 0xf5: /* cmc */ - /* complement carry flag from eflags reg */ - ctxt->eflags ^= EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0xf8: /* clc */ - ctxt->eflags &= ~EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfc: /* cld */ - ctxt->eflags &= ~EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfd: /* std */ - ctxt->eflags |= EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfe ... 0xff: /* Grp4/Grp5 */ - rc = emulate_grp45(ctxt, ops); - if (rc != 0) - goto done; - break; - } - -writeback: - rc = writeback(ctxt, ops); - if (rc != 0) - goto done; - - /* Commit shadow register state. */ - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); - -done: - if (rc == X86EMUL_UNHANDLEABLE) { - c->eip = saved_eip; - return -1; - } - return 0; - -twobyte_insn: - switch (c->b) { - case 0x01: /* lgdt, lidt, lmsw */ - switch (c->modrm_reg) { - u16 size; - unsigned long address; - - case 0: /* vmcall */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - goto cannot_emulate; - - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - - /* Let the processor re-execute the fixed hypercall */ - c->eip = kvm_rip_read(ctxt->vcpu); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, c->op_bytes); - if (rc) - goto done; - realmode_lgdt(ctxt->vcpu, size, address); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3) { - switch (c->modrm_rm) { - case 1: - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - break; - default: - goto cannot_emulate; - } - } else { - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, - c->op_bytes); - if (rc) - goto done; - realmode_lidt(ctxt->vcpu, size, address); - } - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 4: /* smsw */ - c->dst.bytes = 2; - c->dst.val = realmode_get_cr(ctxt->vcpu, 0); - break; - case 6: /* lmsw */ - realmode_lmsw(ctxt->vcpu, (u16)c->src.val, - &ctxt->eflags); - c->dst.type = OP_NONE; - break; - case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, memop); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - default: - goto cannot_emulate; - } - break; - case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; - else - goto writeback; - break; - case 0x06: - emulate_clts(ctxt->vcpu); - c->dst.type = OP_NONE; - break; - case 0x08: /* invd */ - case 0x09: /* wbinvd */ - case 0x0d: /* GrpP (prefetch) */ - case 0x18: /* Grp16 (prefetch/nop) */ - c->dst.type = OP_NONE; - break; - case 0x20: /* mov cr, reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - c->regs[c->modrm_rm] = - realmode_get_cr(ctxt->vcpu, c->modrm_reg); - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x21: /* mov from dr to reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x22: /* mov reg, cr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_set_cr(ctxt->vcpu, - c->modrm_reg, c->modrm_val, &ctxt->eflags); - c->dst.type = OP_NONE; - break; - case 0x23: /* mov from reg to dr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_set_dr(ctxt, c->modrm_reg, - c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x30: - /* wrmsr */ - msr_data = (u32)c->regs[VCPU_REGS_RAX] - | ((u64)c->regs[VCPU_REGS_RDX] << 32); - rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = kvm_rip_read(ctxt->vcpu); - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x32: - /* rdmsr */ - rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = kvm_rip_read(ctxt->vcpu); - } else { - c->regs[VCPU_REGS_RAX] = (u32)msr_data; - c->regs[VCPU_REGS_RDX] = msr_data >> 32; - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; - else - goto writeback; - break; - case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; - else - goto writeback; - break; - case 0x40 ... 0x4f: /* cmov */ - c->dst.val = c->dst.orig_val = c->src.val; - if (!test_cc(c->b, ctxt->eflags)) - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ - if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, c->src.val); - c->dst.type = OP_NONE; - break; - case 0xa3: - bt: /* bt */ - c->dst.type = OP_NONE; - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); - break; - case 0xa4: /* shld imm8, r, r/m */ - case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); - break; - case 0xab: - bts: /* bts */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); - break; - case 0xac: /* shrd imm8, r, r/m */ - case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); - break; - case 0xae: /* clflush */ - break; - case 0xb0 ... 0xb1: /* cmpxchg */ - /* - * Save real source value, then compare EAX against - * destination. - */ - c->src.orig_val = c->src.val; - c->src.val = c->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ - c->dst.val = c->src.orig_val; - } else { - /* Failure: write the value we saw to EAX. */ - c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - } - break; - case 0xb3: - btr: /* btr */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); - break; - case 0xb6 ... 0xb7: /* movzx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (u8) c->src.val - : (u16) c->src.val; - break; - case 0xba: /* Grp8 */ - switch (c->modrm_reg & 3) { - case 0: - goto bt; - case 1: - goto bts; - case 2: - goto btr; - case 3: - goto btc; - } - break; - case 0xbb: - btc: /* btc */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); - break; - case 0xbe ... 0xbf: /* movsx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : - (s16) c->src.val; - break; - case 0xc3: /* movnti */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : - (u64) c->src.val; - break; - case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops, memop); - if (rc != 0) - goto done; - c->dst.type = OP_NONE; - break; - } - goto writeback; - -cannot_emulate: - DPRINTF("Cannot emulate %02x\n", c->b); - c->eip = saved_eip; - return -1; -} -- cgit v1.2.3-70-g09d2 From 0aaeb3b1087b21fef434f1bae6e6495c12ee2f55 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 13 Aug 2009 20:05:48 +0530 Subject: Documentation: Update KVM list email address The KVM list moved to vger.kernel.org last year Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- Documentation/ioctl/ioctl-number.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index dbea4f95fc85..999a2017df77 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -192,7 +192,7 @@ Code Seq# Include File Comments 0xAD 00 Netfilter device in development: 0xAE all linux/kvm.h Kernel-based Virtual Machine - + 0xB0 all RATIO devices in development: 0xB1 00-1F PPPoX -- cgit v1.2.3-70-g09d2 From da18acffc3f13c4f187c20bc5dcfcb110b374b48 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 10 Aug 2009 15:59:25 +0300 Subject: KVM: export kvm_para.h kvm_para.h contains userspace interface and so should be exported. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- include/asm-generic/Kbuild.asm | 5 +++++ include/linux/Kbuild | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm index 290910e4ede4..96d7c9804dc1 100644 --- a/include/asm-generic/Kbuild.asm +++ b/include/asm-generic/Kbuild.asm @@ -3,6 +3,11 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \ header-y += kvm.h endif +ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \ + $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),) +header-y += kvm_para.h +endif + ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h \ $(srctree)/include/asm-$(SRCARCH)/a.out.h),) unifdef-y += a.out.h diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 334a3593cdfd..cff4a101f266 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -268,6 +268,10 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \ $(srctree)/include/asm-$(SRCARCH)/kvm.h),) unifdef-y += kvm.h endif +ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \ + $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),) +unifdef-y += kvm_para.h +endif unifdef-y += llc.h unifdef-y += loop.h unifdef-y += lp.h -- cgit v1.2.3-70-g09d2 From 27c238106862fb0dd3e229c48c1ef56502b0ec88 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Aug 2009 15:31:11 +0300 Subject: KVM: Add __KERNEL__ guards to exported headers Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_para.h | 4 ++++ arch/s390/include/asm/kvm_para.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h index 0d6d8ca07b8c..1588aee781a2 100644 --- a/arch/ia64/include/asm/kvm_para.h +++ b/arch/ia64/include/asm/kvm_para.h @@ -19,9 +19,13 @@ * */ +#ifdef __KERNEL__ + static inline unsigned int kvm_arch_para_features(void) { return 0; } #endif + +#endif diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h index 2c503796b619..6964db226f83 100644 --- a/arch/s390/include/asm/kvm_para.h +++ b/arch/s390/include/asm/kvm_para.h @@ -13,6 +13,8 @@ #ifndef __S390_KVM_PARA_H #define __S390_KVM_PARA_H +#ifdef __KERNEL__ + /* * Hypercalls for KVM on s390. The calling convention is similar to the * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1 @@ -147,4 +149,6 @@ static inline unsigned int kvm_arch_para_features(void) return 0; } +#endif + #endif /* __S390_KVM_PARA_H */ -- cgit v1.2.3-70-g09d2 From fa6870c6b64214aced218d930ec7221e2a9767b8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Aug 2009 15:31:33 +0300 Subject: KVM: Add missing #include Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index b8a3305ae093..c584076a47f4 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_KVM_PARA_H #define _ASM_X86_KVM_PARA_H +#include + /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. */ -- cgit v1.2.3-70-g09d2 From 7bdb588827265bf42dd420e95d34275c83603568 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 19 Aug 2009 01:26:28 +0300 Subject: KVM: x86 emulator: Add adc and sbb missing decoder flags Add missing decoder flags for adc and sbb instructions (opcodes 0x14-0x15, 0x1c-0x1d) Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2eb807a7b620..1be5cd640e93 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -100,11 +100,11 @@ static u32 opcode_table[256] = { /* 0x10 - 0x17 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x18 - 0x1F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit v1.2.3-70-g09d2 From d9048d3278ed4d395e63899257c3fa377693968b Mon Sep 17 00:00:00 2001 From: Mikhail Ershov Date: Wed, 19 Aug 2009 14:08:07 +0400 Subject: KVM: Use kvm_{read,write}_guest_virt() to read and write segment descriptors Segment descriptors tables can be placed on two non-contiguous pages. This patch makes reading segment descriptors by linear address. Signed-off-by: Mikhail Ershov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0e942747b0f..59a8ba4d56bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4021,7 +4021,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { - gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -4031,16 +4030,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return 1; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); - gpa += index * 8; - return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); + return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } /* allowed just for 8 bytes segments */ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { - gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -4048,9 +4044,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); - gpa += index * 8; - return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, -- cgit v1.2.3-70-g09d2 From 95eb84a7588d7d7afd3096807efc052adc7479e1 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 19 Aug 2009 09:52:18 +0800 Subject: KVM: VMX: Fix EPT with WP bit change during paging QNX update WP bit when paging enabled, which is not covered yet. This one fix QNX boot with EPT. Cc: stable@kernel.org Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2b7e7bd190fd..1ee811cef236 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1642,7 +1642,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - *hw_cr0 &= ~X86_CR0_WP; } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1651,9 +1650,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - if (!(vcpu->arch.cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; } + + if (!(cr0 & X86_CR0_WP)) + *hw_cr0 &= ~X86_CR0_WP; } static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, -- cgit v1.2.3-70-g09d2 From 88c808fd42b53a7e01a2ac3253ef31fef74cb5af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 17 Aug 2009 22:49:40 +0300 Subject: KVM: Protect update_cr8_intercept() when running without an apic update_cr8_intercept() can be triggered from userspace while there is no apic present. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59a8ba4d56bd..35e7fc54de35 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3500,6 +3500,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!kvm_x86_ops->update_cr8_intercept) return; + if (!vcpu->arch.apic) + return; + if (!vcpu->arch.apic->vapic_addr) max_irr = kvm_lapic_find_highest_irr(vcpu); else -- cgit v1.2.3-70-g09d2 From 5dadbfd64724c41716d4fc82df6f01b023d5b15d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 23 Aug 2009 17:08:04 +0300 Subject: KVM: Document KVM_CAP_IRQCHIP Signed-off-by: Avi Kivity --- Documentation/kvm/api.txt | 76 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index 1b1c22da211b..5a4bc8cf6d04 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -517,6 +517,82 @@ struct kvm_fpu { __u32 pad2; }; +4.23 KVM_CREATE_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Creates an interrupt controller model in the kernel. On x86, creates a virtual +ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a +local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 +only go to the IOAPIC. On ia64, a IOSAPIC is created. + +4.24 KVM_IRQ_LINE + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irq_level +Returns: 0 on success, -1 on error + +Sets the level of a GSI input to the interrupt controller model in the kernel. +Requires that an interrupt controller model has been previously created with +KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level +to be set to 1 and then back to 0. + +struct kvm_irq_level { + union { + __u32 irq; /* GSI */ + __s32 status; /* not used for KVM_IRQ_LEVEL */ + }; + __u32 level; /* 0 or 1 */ +}; + +4.25 KVM_GET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irqchip (in/out) +Returns: 0 on success, -1 on error + +Reads the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP into a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + +4.26 KVM_SET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irqchip (in) +Returns: 0 on success, -1 on error + +Sets the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP from a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by -- cgit v1.2.3-70-g09d2 From 60f24784a92c25c269a5e741a8ce8ff63e887be8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 27 Aug 2009 13:37:06 +0300 Subject: KVM: Optimize kvm_mmu_unprotect_page_virt() for tdp We know no pages are protected, so we can short-circuit the whole thing (including fairly nasty guest memory accesses). Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6f38178af926..eca41ae9f453 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2694,6 +2694,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; + if (tdp_enabled) + return 0; + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); spin_lock(&vcpu->kvm->mmu_lock); -- cgit v1.2.3-70-g09d2 From 5fff7d270bd6a4759b6d663741b729cdee370257 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 27 Aug 2009 18:41:30 +0300 Subject: KVM: VMX: Fix cr8 exiting control clobbering by EPT Don't call adjust_vmx_controls() two times for the same control. It restores options that were dropped earlier. This loses us the cr8 exit control, which causes a massive performance regression Windows x64. Cc: stable@kernel.org Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1ee811cef236..eec04129402f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1262,12 +1262,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ - min &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_INVLPG_EXITING); - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; + _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid); } -- cgit v1.2.3-70-g09d2 From a20316d2aa41a8f4fd171648bad8f044f6060826 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 31 Aug 2009 03:04:31 -0400 Subject: KVM guest: fix bogus wallclock physical address calculation The use of __pa() to calculate the address of a C-visible symbol is wrong, and can lead to unpredictable results. See arch/x86/include/asm/page.h for details. It should be replaced with __pa_symbol(), that does the correct math here, by taking relocations into account. This ensures the correct wallclock data structure physical address is passed to the hypervisor. Cc: stable@kernel.org Signed-off-by: Glauber Costa Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f1526..e5efcdcca31b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -50,8 +50,8 @@ static unsigned long kvm_get_wallclock(void) struct timespec ts; int low, high; - low = (int)__pa(&wall_clock); - high = ((u64)__pa(&wall_clock) >> 32); + low = (int)__pa_symbol(&wall_clock); + high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(MSR_KVM_WALL_CLOCK, low, high); vcpu_time = &get_cpu_var(hv_clock); -- cgit v1.2.3-70-g09d2 From 65a82211636f156a276cac3f8665605ae18f371f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 3 Sep 2009 12:10:34 +0300 Subject: KVM: Fix coalesced interrupt reporting in IOAPIC This bug was introduced by b4a2f5e723e4f7df467. Cc: stable@kernel.org Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/ioapic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index fa05f67423ab..9fe140bb38ec 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -193,6 +193,8 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) if ((edge && old_irr != ioapic->irr) || (!edge && !entry.fields.remote_irr)) ret = ioapic_service(ioapic, irq); + else + ret = 0; /* report coalesced interrupt */ } trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); } -- cgit v1.2.3-70-g09d2 From 6ba661787594868512a71c129062ebd57d0c01e7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 25 Aug 2009 01:13:10 -0300 Subject: KVM guest: do not batch pte updates from interrupt context Commit b8bcfe997e4 made paravirt pte updates synchronous in interrupt context. Unfortunately the KVM pv mmu code caches the lazy/nonlazy mode internally, so a pte update from interrupt context during a lazy mmu operation can be batched while it should be performed synchronously. https://bugzilla.redhat.com/show_bug.cgi?id=518022 Drop the internal mode variable and use paravirt_get_lazy_mode(), which returns the correct state. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c664d515f613..63b0ec8d3d4a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,7 +34,6 @@ struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; - enum paravirt_lazy_mode mode; }; static DEFINE_PER_CPU(struct kvm_para_state, para_state); @@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len) { struct kvm_para_state *state = kvm_para_state(); - if (state->mode != PARAVIRT_LAZY_MMU) { + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { kvm_mmu_op(buffer, len); return; } @@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn) static void kvm_enter_lazy_mmu(void) { - struct kvm_para_state *state = kvm_para_state(); - paravirt_enter_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void kvm_leave_lazy_mmu(void) @@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void) mmu_queue_flush(state); paravirt_leave_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void __init paravirt_ops_setup(void) -- cgit v1.2.3-70-g09d2 From 3d53c27d05950390712f92c5ad1604c60190ed64 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Sep 2009 12:34:07 +0300 Subject: KVM: Use thread debug register storage instead of kvm specific data Instead of saving the debug registers from the processor to a kvm data structure, rely in the debug registers stored in the thread structure. This allows us not to save dr6 and dr7. Reduces lightweight vmexit cost by 350 cycles, or 11 percent. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 --- arch/x86/kvm/x86.c | 22 +++++++--------------- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33901be75a36..e8f166a02c79 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -362,9 +362,6 @@ struct kvm_vcpu_arch { u32 pat; int switch_db_regs; - unsigned long host_db[KVM_NR_DB_REGS]; - unsigned long host_dr6; - unsigned long host_dr7; unsigned long db[KVM_NR_DB_REGS]; unsigned long dr6; unsigned long dr7; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35e7fc54de35..3e893c4fdd39 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3623,14 +3623,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_guest_enter(); - get_debugreg(vcpu->arch.host_dr6, 6); - get_debugreg(vcpu->arch.host_dr7, 7); if (unlikely(vcpu->arch.switch_db_regs)) { - get_debugreg(vcpu->arch.host_db[0], 0); - get_debugreg(vcpu->arch.host_db[1], 1); - get_debugreg(vcpu->arch.host_db[2], 2); - get_debugreg(vcpu->arch.host_db[3], 3); - set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); @@ -3641,15 +3634,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu, kvm_run); - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); - set_debugreg(vcpu->arch.host_db[0], 0); - set_debugreg(vcpu->arch.host_db[1], 1); - set_debugreg(vcpu->arch.host_db[2], 2); - set_debugreg(vcpu->arch.host_db[3], 3); + if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { + set_debugreg(current->thread.debugreg0, 0); + set_debugreg(current->thread.debugreg1, 1); + set_debugreg(current->thread.debugreg2, 2); + set_debugreg(current->thread.debugreg3, 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(current->thread.debugreg7, 7); } - set_debugreg(vcpu->arch.host_dr6, 6); - set_debugreg(vcpu->arch.host_dr7, 7); set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); -- cgit v1.2.3-70-g09d2 From e8a48342e9bc1c65ffe4bb3b3ac964e726dbd4c0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Sep 2009 16:06:25 +0300 Subject: KVM: VMX: Conditionally reload debug register 6 Only reload debug register 6 if we're running with the guest's debug registers. Saves around 150 cycles from the guest lightweight exit path. dr6 contains a couple of bits that are updated on #DB, so intercept that unconditionally and update those bits then. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index eec04129402f..f482100eff87 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -540,10 +540,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; + /* + * Unconditionally intercept #DB so we can maintain dr6 without + * reading it every exit. + */ + eb |= 1u << DB_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - eb |= 1u << DB_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) eb |= 1u << BP_VECTOR; } @@ -3632,7 +3634,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ vmcs_writel(HOST_CR0, read_cr0()); - set_debugreg(vcpu->arch.dr6, 6); + if (vcpu->arch.switch_db_regs) + set_debugreg(vcpu->arch.dr6, 6); asm( /* Store host registers */ @@ -3734,7 +3737,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - get_debugreg(vcpu->arch.dr6, 6); + if (vcpu->arch.switch_db_regs) + get_debugreg(vcpu->arch.dr6, 6); vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) -- cgit v1.2.3-70-g09d2 From 542423b0dd162a9dbf91109461703bd0e545c71f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 27 Aug 2009 15:07:30 +0300 Subject: KVM: VMX: call vmx_load_host_state() only if msr is cached No need to call it before each kvm_(set|get)_msr_common() Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f482100eff87..cc6e00a9f724 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1007,9 +1007,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: - vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { + vmx_load_host_state(to_vmx(vcpu)); data = msr->data; break; } @@ -1066,9 +1066,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } /* Otherwise falls through to kvm_set_msr_common */ default: - vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { + vmx_load_host_state(vmx); msr->data = data; break; } -- cgit v1.2.3-70-g09d2 From e3904e6ed0d525e383eb961ed1da0596a10e5387 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 8 Sep 2009 14:50:51 -0300 Subject: KVM: x86: drop duplicate kvm_flush_remote_tlb calls kvm_mmu_slot_remove_write_access already calls it. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3e893c4fdd39..7627ff607a90 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2152,7 +2152,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); memslot = &kvm->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; memset(memslot->dirty_bitmap, 0, n); @@ -4896,7 +4895,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); return 0; } -- cgit v1.2.3-70-g09d2 From 4da748960a6bd7b1e123e01bfa8f2dbcb6be209e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 27 Aug 2009 16:25:04 +0300 Subject: KVM: fix misreporting of coalesced interrupts by kvm tracer Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5b9d1ae09cad..1ae5ceba7eb2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -376,7 +376,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = !apic_test_and_set_irr(vector, apic); trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, result); + trig_mode, vector, !result); if (!result) { if (trig_mode) apic_debug("level trig mode repeatedly for " -- cgit v1.2.3-70-g09d2 From 0a79b009525b160081d75cef5dbf45817956acf2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Sep 2009 12:03:25 +0300 Subject: KVM: VMX: Check cpl before emulating debug register access Debug registers may only be accessed from cpl 0. Unfortunately, vmx will code to emulate the instruction even though it was issued from guest userspace, possibly leading to an unexpected trap later. Cc: stable@kernel.org Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 2 ++ arch/x86/kvm/x86.c | 13 +++++++++++++ 3 files changed, 16 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e8f166a02c79..3be000435fad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -620,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cc6e00a9f724..f3812014bd0b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2934,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long val; int dr, reg; + if (!kvm_require_cpl(vcpu, 0)) + return 1; dr = vmcs_readl(GUEST_DR7); if (dr & DR7_GD) { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7627ff607a90..4137cc579ba5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -222,6 +222,19 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); +/* + * Checks if cpl <= required_cpl; if true, return true. Otherwise queue + * a #GP and return false. + */ +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) +{ + if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + return true; + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return false; +} +EXPORT_SYMBOL_GPL(kvm_require_cpl); + /* * Load the pae pdptrs. Return true is they are all valid. */ -- cgit v1.2.3-70-g09d2 From 28bcb112183cb822a394b155a2e7d788fe4a109a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 3 Sep 2009 17:35:35 +0200 Subject: KVM: fix compile warnings on s390 CC arch/s390/kvm/../../../virt/kvm/kvm_main.o arch/s390/kvm/../../../virt/kvm/kvm_main.c: In function '__kvm_set_memory_region': arch/s390/kvm/../../../virt/kvm/kvm_main.c:485: warning: unused variable 'j' arch/s390/kvm/../../../virt/kvm/kvm_main.c:484: warning: unused variable 'lpages' arch/s390/kvm/../../../virt/kvm/kvm_main.c:483: warning: unused variable 'ugfn' Cc: Carsten Otte Signed-off-by: Heiko Carstens Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1df4c0422f68..897bff3b7df9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1091,9 +1091,8 @@ int __kvm_set_memory_region(struct kvm *kvm, { int r; gfn_t base_gfn; - unsigned long npages, ugfn; - int lpages; - unsigned long i, j; + unsigned long npages; + unsigned long i; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; @@ -1171,6 +1170,9 @@ int __kvm_set_memory_region(struct kvm *kvm, goto skip_lpage; for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + unsigned long j; + int lpages; int level = i + 2; /* Avoid unused variable warning if no large pages */ -- cgit v1.2.3-70-g09d2 From 6223011fb9f90fab92635f1f782196cbd2ccf24f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 28 Jul 2009 17:53:24 +0200 Subject: KVM: correct error-handling code This code is not executed before file has been initialized to the result of calling eventfd_fget. This function returns an ERR_PTR value in an error case instead of NULL. Thus the test that file is not NULL is always true. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @match exists@ expression x, E; statement S1, S2; @@ x = eventfd_fget(...) ... when != x = E ( * if (x == NULL || ...) S1 else S2 | * if (x == NULL && ...) S1 else S2 ) // Signed-off-by: Julia Lawall Signed-off-by: Avi Kivity --- virt/kvm/eventfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 99017e8a92ac..bb4ebd89b9ff 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -230,7 +230,7 @@ fail: if (eventfd && !IS_ERR(eventfd)) eventfd_ctx_put(eventfd); - if (file && !IS_ERR(file)) + if (!IS_ERR(file)) fput(file); kfree(irqfd); -- cgit v1.2.3-70-g09d2 From 8e616fc8d343bd7f0f0a0c22407fdcb77f6d22b1 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 10 Sep 2009 17:21:34 -0300 Subject: MAINTAINERS: update KVM entry Add myself to KVM MAINTAINERS entry. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8dca9d89c6c1..24f26940b91b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2928,6 +2928,7 @@ F: include/linux/sunrpc/ KERNEL VIRTUAL MACHINE (KVM) M: Avi Kivity +M: Marcelo Tosatti L: kvm@vger.kernel.org W: http://kvm.qumranet.com S: Supported -- cgit v1.2.3-70-g09d2