From af585b921e5d1e919947c4b1164b59507fe7cd7b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:46 +0200 Subject: KVM: Halt vcpu if page it tries to access is swapped out If a guest accesses swapped out memory do not swap it in from vcpu thread context. Schedule work to do swapping and put vcpu into halted state instead. Interrupts will still be delivered to the guest and if interrupt will cause reschedule guest will continue to run another task. [avi: remove call to get_user_pages_noio(), nacked by Linus; this makes everything synchrnous again] Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 31 ++++++++++++++++ include/trace/events/kvm.h | 90 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a0557422715e..e56acc7857e2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -40,6 +40,7 @@ #define KVM_REQ_KICK 9 #define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_REQ_EVENT 11 +#define KVM_REQ_APF_HALT 12 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 @@ -74,6 +75,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); +#ifdef CONFIG_KVM_ASYNC_PF +struct kvm_async_pf { + struct work_struct work; + struct list_head link; + struct list_head queue; + struct kvm_vcpu *vcpu; + struct mm_struct *mm; + gva_t gva; + unsigned long addr; + struct kvm_arch_async_pf arch; + struct page *page; + bool done; +}; + +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, + struct kvm_arch_async_pf *arch); +#endif + struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -104,6 +125,15 @@ struct kvm_vcpu { gpa_t mmio_phys_addr; #endif +#ifdef CONFIG_KVM_ASYNC_PF + struct { + u32 queued; + struct list_head queue; + struct list_head done; + spinlock_t lock; + } async_pf; +#endif + struct kvm_vcpu_arch arch; }; @@ -302,6 +332,7 @@ void kvm_set_page_accessed(struct page *page); pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 6dd3a51ab1cb..a78a5e574632 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -185,6 +185,96 @@ TRACE_EVENT(kvm_age_page, __entry->referenced ? "YOUNG" : "OLD") ); +#ifdef CONFIG_KVM_ASYNC_PF +TRACE_EVENT( + kvm_try_async_get_page, + TP_PROTO(bool async, u64 pfn), + TP_ARGS(async, pfn), + + TP_STRUCT__entry( + __field(__u64, pfn) + ), + + TP_fast_assign( + __entry->pfn = (!async) ? pfn : (u64)-1; + ), + + TP_printk("pfn %#llx", __entry->pfn) +); + +TRACE_EVENT( + kvm_async_pf_not_present, + TP_PROTO(u64 gva), + TP_ARGS(gva), + + TP_STRUCT__entry( + __field(__u64, gva) + ), + + TP_fast_assign( + __entry->gva = gva; + ), + + TP_printk("gva %#llx not present", __entry->gva) +); + +TRACE_EVENT( + kvm_async_pf_ready, + TP_PROTO(u64 gva), + TP_ARGS(gva), + + TP_STRUCT__entry( + __field(__u64, gva) + ), + + TP_fast_assign( + __entry->gva = gva; + ), + + TP_printk("gva %#llx ready", __entry->gva) +); + +TRACE_EVENT( + kvm_async_pf_completed, + TP_PROTO(unsigned long address, struct page *page, u64 gva), + TP_ARGS(address, page, gva), + + TP_STRUCT__entry( + __field(unsigned long, address) + __field(pfn_t, pfn) + __field(u64, gva) + ), + + TP_fast_assign( + __entry->address = address; + __entry->pfn = page ? page_to_pfn(page) : 0; + __entry->gva = gva; + ), + + TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva, + __entry->address, __entry->pfn) +); + +TRACE_EVENT( + kvm_async_pf_doublefault, + TP_PROTO(u64 gva, u64 gfn), + TP_ARGS(gva, gfn), + + TP_STRUCT__entry( + __field(u64, gva) + __field(u64, gfn) + ), + + TP_fast_assign( + __entry->gva = gva; + __entry->gfn = gfn; + ), + + TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) +); + +#endif + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ -- cgit v1.2.3-70-g09d2 From 49c7754ce57063b819b01eb8a4290841ad0886c4 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 18 Oct 2010 15:22:23 +0200 Subject: KVM: Add memory slot versioning and use it to provide fast guest write interface Keep track of memslots changes by keeping generation number in memslots structure. Provide kvm_write_guest_cached() function that skips gfn_to_hva() translation if memslots was not changed since previous invocation. Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 7 +++++ include/linux/kvm_types.h | 7 +++++ virt/kvm/kvm_main.c | 75 +++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 78 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 71beb27597fd..bd254779d1cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3190,6 +3190,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + slots->generation++; old_slots = kvm->memslots; rcu_assign_pointer(kvm->memslots, slots); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e56acc7857e2..e6748204cd56 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -199,6 +199,7 @@ struct kvm_irq_routing_table {}; struct kvm_memslots { int nmemslots; + u64 generation; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS]; }; @@ -352,12 +353,18 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, + gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 7ac0d4eee430..fa7cc7244cbd 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -67,4 +67,11 @@ struct kvm_lapic_irq { u32 dest_id; }; +struct gfn_to_hva_cache { + u64 generation; + gpa_t gpa; + unsigned long hva; + struct kvm_memory_slot *memslot; +}; + #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 75fd590c0214..228f00f87966 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -687,6 +687,7 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; + slots->generation++; slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; old_memslots = kvm->memslots; @@ -721,6 +722,7 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; + slots->generation++; /* actual memory is freed via old in kvm_free_physmem_slot below */ if (!npages) { @@ -851,10 +853,10 @@ int kvm_is_error_hva(unsigned long addr) } EXPORT_SYMBOL_GPL(kvm_is_error_hva); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, + gfn_t gfn) { int i; - struct kvm_memslots *slots = kvm_memslots(kvm); for (i = 0; i < slots->nmemslots; ++i) { struct kvm_memory_slot *memslot = &slots->memslots[i]; @@ -865,6 +867,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) } return NULL; } + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_memslot(kvm_memslots(kvm), gfn); +} EXPORT_SYMBOL_GPL(gfn_to_memslot); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) @@ -927,12 +934,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn) return memslot - slots->memslots; } -static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, +static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(kvm, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return bad_hva(); @@ -944,7 +948,7 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { - return gfn_to_hva_many(kvm, gfn, NULL); + return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva); @@ -1054,7 +1058,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, unsigned long addr; gfn_t entry; - addr = gfn_to_hva_many(kvm, gfn, &entry); + addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); if (kvm_is_error_hva(addr)) return -1; @@ -1238,6 +1242,47 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, return 0; } +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int offset = offset_in_page(gpa); + gfn_t gfn = gpa >> PAGE_SHIFT; + + ghc->gpa = gpa; + ghc->generation = slots->generation; + ghc->memslot = __gfn_to_memslot(slots, gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); + if (!kvm_is_error_hva(ghc->hva)) + ghc->hva += offset; + else + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); + +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int r; + + if (slots->generation != ghc->generation) + kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); + + if (kvm_is_error_hva(ghc->hva)) + return -EFAULT; + + r = copy_to_user((void __user *)ghc->hva, data, len); + if (r) + return -EFAULT; + mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); + int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); @@ -1263,11 +1308,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, + gfn_t gfn) { - struct kvm_memory_slot *memslot; - - memslot = gfn_to_memslot(kvm, gfn); if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; @@ -1275,6 +1318,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) } } +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + memslot = gfn_to_memslot(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); +} + /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. */ -- cgit v1.2.3-70-g09d2 From 344d9588a9df06182684168be4f1408b55c7da3e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:50 +0200 Subject: KVM: Add PV MSR to enable asynchronous page faults delivery. Guest enables async PF vcpu functionality using this MSR. Reviewed-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- Documentation/kvm/cpuid.txt | 3 +++ Documentation/kvm/msr.txt | 35 ++++++++++++++++++++++++++++++++++- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/asm/kvm_para.h | 4 ++++ arch/x86/kvm/x86.c | 38 ++++++++++++++++++++++++++++++++++++-- include/linux/kvm.h | 1 + include/linux/kvm_host.h | 1 + virt/kvm/async_pf.c | 20 ++++++++++++++++++++ 8 files changed, 101 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt index 14a12ea92b7f..882068538c9c 100644 --- a/Documentation/kvm/cpuid.txt +++ b/Documentation/kvm/cpuid.txt @@ -36,6 +36,9 @@ KVM_FEATURE_MMU_OP || 2 || deprecated. KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs || || 0x4b564d00 and 0x4b564d01 ------------------------------------------------------------------------------ +KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by + || || writing to msr 0x4b564d02 +------------------------------------------------------------------------------ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt index 8ddcfe84c09a..e67b4a8783df 100644 --- a/Documentation/kvm/msr.txt +++ b/Documentation/kvm/msr.txt @@ -3,7 +3,6 @@ Glauber Costa , Red Hat Inc, 2010 ===================================================== KVM makes use of some custom MSRs to service some requests. -At present, this facility is only used by kvmclock. Custom MSRs have a range reserved for them, that goes from 0x4b564d00 to 0x4b564dff. There are MSRs outside this area, @@ -151,3 +150,37 @@ MSR_KVM_SYSTEM_TIME: 0x12 return PRESENT; } else return NON_PRESENT; + +MSR_KVM_ASYNC_PF_EN: 0x4b564d02 + data: Bits 63-6 hold 64-byte aligned physical address of a + 64 byte memory area which must be in guest RAM and must be + zeroed. Bits 5-1 are reserved and should be zero. Bit 0 is 1 + when asynchronous page faults are enabled on the vcpu 0 when + disabled. + + First 4 byte of 64 byte memory location will be written to by + the hypervisor at the time of asynchronous page fault (APF) + injection to indicate type of asynchronous page fault. Value + of 1 means that the page referred to by the page fault is not + present. Value 2 means that the page is now available. Disabling + interrupt inhibits APFs. Guest must not enable interrupt + before the reason is read, or it may be overwritten by another + APF. Since APF uses the same exception vector as regular page + fault guest must reset the reason to 0 before it does + something that can generate normal page fault. If during page + fault APF reason is 0 it means that this is regular page + fault. + + During delivery of type 1 APF cr2 contains a token that will + be used to notify a guest when missing page becomes + available. When page becomes available type 2 APF is sent with + cr2 set to the token associated with the page. There is special + kind of token 0xffffffff which tells vcpu that it should wake + up all processes waiting for APFs and no individual type 2 APFs + will be sent. + + If APF is disabled while there are outstanding APFs, they will + not be delivered. + + Currently type 2 APF will be always delivered on the same vcpu as + type 1 was, but guest should not rely on that. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c3076bcf5ef7..0d7039804b4c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -419,6 +419,8 @@ struct kvm_vcpu_arch { struct { bool halted; gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; + struct gfn_to_hva_cache data; + u64 msr_val; } apf; }; diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index e3faaaf4301e..8662ae0a035c 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -20,6 +20,7 @@ * are available. The use of 0x11 and 0x12 is deprecated */ #define KVM_FEATURE_CLOCKSOURCE2 3 +#define KVM_FEATURE_ASYNC_PF 4 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. @@ -32,9 +33,12 @@ /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 +#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define KVM_MAX_MMU_OP_BATCH 32 +#define KVM_ASYNC_PF_ENABLED (1 << 0) + /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 #define KVM_MMU_OP_FLUSH_TLB 2 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd254779d1cc..063c07296764 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -783,12 +783,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 7 +#define KVM_SAVE_MSRS_BEGIN 8 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1425,6 +1425,29 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 0; } +static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) +{ + gpa_t gpa = data & ~0x3f; + + /* Bits 1:5 are resrved, Should be zero */ + if (data & 0x3e) + return 1; + + vcpu->arch.apf.msr_val = data; + + if (!(data & KVM_ASYNC_PF_ENABLED)) { + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + return 0; + } + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) + return 1; + + kvm_async_pf_wakeup_all(vcpu); + return 0; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1506,6 +1529,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } break; } + case MSR_KVM_ASYNC_PF_EN: + if (kvm_pv_enable_async_pf(vcpu, data)) + return 1; + break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: @@ -1782,6 +1809,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_SYSTEM_TIME_NEW: data = vcpu->arch.time; break; + case MSR_KVM_ASYNC_PF_EN: + data = vcpu->arch.apf.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -1929,6 +1959,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: + case KVM_CAP_ASYNC_PF: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -5792,6 +5823,8 @@ free_vcpu: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + vcpu->arch.apf.msr_val = 0; + vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -5811,6 +5844,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.dr7 = DR7_FIXED_1; kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu->arch.apf.msr_val = 0; kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 919ae53adc5c..ea2dc1a2e13d 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -540,6 +540,7 @@ struct kvm_ppc_pvinfo { #endif #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 +#define KVM_CAP_ASYNC_PF 59 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e6748204cd56..ee4314e15ead 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -93,6 +93,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, struct kvm_arch_async_pf *arch); +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif struct kvm_vcpu { diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index e97eae965a4c..1f59498561b2 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -190,3 +190,23 @@ retry_sync: kmem_cache_free(async_pf_cache, work); return 0; } + +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) +{ + struct kvm_async_pf *work; + + if (!list_empty(&vcpu->async_pf.done)) + return 0; + + work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); + if (!work) + return -ENOMEM; + + work->page = bad_page; + get_page(bad_page); + INIT_LIST_HEAD(&work->queue); /* for list_del to work */ + + list_add_tail(&work->link, &vcpu->async_pf.done); + vcpu->async_pf.queued++; + return 0; +} -- cgit v1.2.3-70-g09d2 From 7c90705bf2a373aa238661bdb6446f27299ef489 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:53 +0200 Subject: KVM: Inject asynchronous page fault into a PV guest if page is swapped out. Send async page fault to a PV guest if it accesses swapped out memory. Guest will choose another task to run upon receiving the fault. Allow async page fault injection only when guest is in user mode since otherwise guest may be in non-sleepable context and will not be able to reschedule. Vcpu will be halted if guest will fault on the same page again or if vcpu executes kernel code. Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/x86.c | 43 ++++++++++++++++++++++++++++++++++++----- include/trace/events/kvm.h | 17 ++++++++++------ virt/kvm/async_pf.c | 3 ++- 5 files changed, 55 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0d7039804b4c..167375cc49ff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -421,6 +421,7 @@ struct kvm_vcpu_arch { gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; struct gfn_to_hva_cache data; u64 msr_val; + u32 id; } apf; }; @@ -596,6 +597,7 @@ struct kvm_x86_ops { }; struct kvm_arch_async_pf { + u32 token; gfn_t gfn; }; @@ -819,6 +821,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b2c60986a7ce..64f90db369fb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2592,6 +2592,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) { struct kvm_arch_async_pf arch; + arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; arch.gfn = gfn; return kvm_setup_async_pf(vcpu, gva, gfn, &arch); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 063c07296764..ac4c368afd40 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6248,20 +6248,53 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) } } +static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) +{ + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { - trace_kvm_async_pf_not_present(work->gva); - - kvm_make_request(KVM_REQ_APF_HALT, vcpu); + trace_kvm_async_pf_not_present(work->arch.token, work->gva); kvm_add_async_pf_gfn(vcpu, work->arch.gfn); + + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || + kvm_x86_ops->get_cpl(vcpu) == 0) + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + vcpu->arch.fault.error_code = 0; + vcpu->arch.fault.address = work->arch.token; + kvm_inject_page_fault(vcpu); + } } void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { - trace_kvm_async_pf_ready(work->gva); - kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + trace_kvm_async_pf_ready(work->arch.token, work->gva); + if (is_error_page(work->page)) + work->arch.token = ~0; /* broadcast wakeup */ + else + kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + + if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && + !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + vcpu->arch.fault.error_code = 0; + vcpu->arch.fault.address = work->arch.token; + kvm_inject_page_fault(vcpu); + } +} + +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) + return true; + else + return !kvm_event_needs_reinjection(vcpu) && + kvm_x86_ops->interrupt_allowed(vcpu); } EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index a78a5e574632..9c2cc6a96e82 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -204,34 +204,39 @@ TRACE_EVENT( TRACE_EVENT( kvm_async_pf_not_present, - TP_PROTO(u64 gva), - TP_ARGS(gva), + TP_PROTO(u64 token, u64 gva), + TP_ARGS(token, gva), TP_STRUCT__entry( + __field(__u64, token) __field(__u64, gva) ), TP_fast_assign( + __entry->token = token; __entry->gva = gva; ), - TP_printk("gva %#llx not present", __entry->gva) + TP_printk("token %#llx gva %#llx not present", __entry->token, + __entry->gva) ); TRACE_EVENT( kvm_async_pf_ready, - TP_PROTO(u64 gva), - TP_ARGS(gva), + TP_PROTO(u64 token, u64 gva), + TP_ARGS(token, gva), TP_STRUCT__entry( + __field(__u64, token) __field(__u64, gva) ), TP_fast_assign( + __entry->token = token; __entry->gva = gva; ), - TP_printk("gva %#llx ready", __entry->gva) + TP_printk("token %#llx gva %#llx ready", __entry->token, __entry->gva) ); TRACE_EVENT( diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 1f59498561b2..60df9e059e69 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -124,7 +124,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) { struct kvm_async_pf *work; - if (list_empty_careful(&vcpu->async_pf.done)) + if (list_empty_careful(&vcpu->async_pf.done) || + !kvm_arch_can_inject_async_page_present(vcpu)) return; spin_lock(&vcpu->async_pf.lock); -- cgit v1.2.3-70-g09d2 From 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 22 Oct 2010 14:18:18 -0200 Subject: KVM: propagate fault r/w information to gup(), allow read-only memory As suggested by Andrea, pass r/w error code to gup(), upgrading read fault to writable if host pte allows it. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 27 +++++++++++++++--------- arch/x86/kvm/paging_tmpl.h | 13 ++++++++---- include/linux/kvm_host.h | 5 ++++- virt/kvm/kvm_main.c | 51 +++++++++++++++++++++++++++++++++++++--------- 4 files changed, 71 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 99433943170c..53509f5973db 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2216,7 +2216,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int level, gfn_t gfn, pfn_t pfn) + int map_writable, int level, gfn_t gfn, pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -2225,9 +2225,13 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, + unsigned pte_access = ACC_ALL; + + if (!map_writable) + pte_access &= ~ACC_WRITE_MASK; + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, 0, write, 1, &pt_write, - level, gfn, pfn, false, true); + level, gfn, pfn, false, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; @@ -2288,6 +2292,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) int level; pfn_t pfn; unsigned long mmu_seq; + bool map_writable; level = mapping_level(vcpu, gfn); @@ -2302,7 +2307,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, gfn); + pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, &map_writable); /* mmio */ if (is_error_pfn(pfn)) @@ -2312,7 +2317,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, level, gfn, pfn); + r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2611,11 +2616,11 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) } static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn, - gva_t gva, pfn_t *pfn) + gva_t gva, pfn_t *pfn, bool write, bool *writable) { bool async; - *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async); + *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); if (!async) return false; /* *pfn has correct page already */ @@ -2632,7 +2637,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn, return true; } - *pfn = gfn_to_pfn(vcpu->kvm, gfn); + *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); return false; } @@ -2645,6 +2650,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; + int write = error_code & PFERR_WRITE_MASK; + bool map_writable; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -2660,7 +2667,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn)) + if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn, write, &map_writable)) return 0; /* mmio */ @@ -2670,7 +2677,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, + r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d6b281e989b1..ba00eefa7bcd 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -427,7 +427,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int user_fault, int write_fault, int hlevel, - int *ptwrite, pfn_t pfn) + int *ptwrite, pfn_t pfn, bool map_writable) { unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; @@ -501,7 +501,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, user_fault, write_fault, dirty, ptwrite, it.level, - gw->gfn, pfn, false, true); + gw->gfn, pfn, false, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); return it.sptep; @@ -539,6 +539,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; unsigned long mmu_seq; + bool map_writable; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -569,13 +570,17 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, no_apf, walker.gfn, addr, &pfn)) + if (try_async_pf(vcpu, no_apf, walker.gfn, addr, &pfn, write_fault, + &map_writable)) return 0; /* mmio */ if (is_error_pfn(pfn)) return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); + if (!map_writable) + walker.pte_access &= ~ACC_WRITE_MASK; + spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; @@ -583,7 +588,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &write_pt, pfn); + level, &write_pt, pfn, map_writable); (void)sptep; pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, sptep, *sptep, write_pt); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ee4314e15ead..462b982fedfb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -334,8 +334,11 @@ void kvm_set_page_accessed(struct page *page); pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); -pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, + bool write_fault, bool *writable); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable); pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); int memslot_id(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 475a100f3a22..2803b4db2a38 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -959,7 +959,7 @@ static pfn_t get_fault_pfn(void) } static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, - bool *async) + bool *async, bool write_fault, bool *writable) { struct page *page[1]; int npages = 0; @@ -968,12 +968,34 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, /* we can do it either atomically or asynchronously, not both */ BUG_ON(atomic && async); + BUG_ON(!write_fault && !writable); + + if (writable) + *writable = true; + if (atomic || async) npages = __get_user_pages_fast(addr, 1, 1, page); if (unlikely(npages != 1) && !atomic) { might_sleep(); - npages = get_user_pages_fast(addr, 1, 1, page); + + if (writable) + *writable = write_fault; + + npages = get_user_pages_fast(addr, 1, write_fault, page); + + /* map read fault as writable if possible */ + if (unlikely(!write_fault) && npages == 1) { + struct page *wpage[1]; + + npages = __get_user_pages_fast(addr, 1, 1, wpage); + if (npages == 1) { + *writable = true; + put_page(page[0]); + page[0] = wpage[0]; + } + npages = 1; + } } if (unlikely(npages != 1)) { @@ -1011,11 +1033,12 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) { - return hva_to_pfn(kvm, addr, true, NULL); + return hva_to_pfn(kvm, addr, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); -static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async) +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, + bool write_fault, bool *writable) { unsigned long addr; @@ -1028,32 +1051,40 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async) return page_to_pfn(bad_page); } - return hva_to_pfn(kvm, addr, atomic, async); + return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); } pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, true, NULL); + return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); -pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async) +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, + bool write_fault, bool *writable) { - return __gfn_to_pfn(kvm, gfn, false, async); + return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); } EXPORT_SYMBOL_GPL(gfn_to_pfn_async); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false, NULL); + return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn); +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable) +{ + return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); + pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long addr = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn(kvm, addr, false, NULL); + return hva_to_pfn(kvm, addr, false, NULL, true, NULL); } int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, -- cgit v1.2.3-70-g09d2 From 64be5007066173d11a4635eedd57d41a3b3a7027 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 24 Oct 2010 16:49:08 +0200 Subject: KVM: x86: trace "exit to userspace" event Add tracepoint for userspace exit. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- include/trace/events/kvm.h | 30 ++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 1 + 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 9c2cc6a96e82..c86f4e8e0bc9 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -6,6 +6,36 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm +#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x } + +#define kvm_trace_exit_reason \ + ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \ + ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ + ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ + ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ + ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI) + +TRACE_EVENT(kvm_userspace_exit, + TP_PROTO(__u32 reason, int errno), + TP_ARGS(reason, errno), + + TP_STRUCT__entry( + __field( __u32, reason ) + __field( int, errno ) + ), + + TP_fast_assign( + __entry->reason = reason; + __entry->errno = errno; + ), + + TP_printk("reason %s (%d)", + __entry->errno < 0 ? + (__entry->errno == -EINTR ? "restart" : "error") : + __print_symbolic(__entry->reason, kvm_trace_exit_reason), + __entry->errno < 0 ? -__entry->errno : __entry->reason) +); + #if defined(__KVM_HAVE_IOAPIC) TRACE_EVENT(kvm_set_irq, TP_PROTO(unsigned int gsi, int level, int irq_source_id), diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2803b4db2a38..880370caf9ed 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1562,6 +1562,7 @@ static long kvm_vcpu_ioctl(struct file *filp, if (arg) goto out; r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; case KVM_GET_REGS: { struct kvm_regs *kvm_regs; -- cgit v1.2.3-70-g09d2 From 515a01279a187415322a80736800a7d6325876ab Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Oct 2010 18:23:54 +0900 Subject: KVM: pre-allocate one more dirty bitmap to avoid vmalloc() Currently x86's kvm_vm_ioctl_get_dirty_log() needs to allocate a bitmap by vmalloc() which will be used in the next logging and this has been causing bad effect to VGA and live-migration: vmalloc() consumes extra systime, triggers tlb flush, etc. This patch resolves this issue by pre-allocating one more bitmap and switching between two bitmaps during dirty logging. Performance improvement: I measured performance for the case of VGA update by trace-cmd. The result was 1.5 times faster than the original one. In the case of live migration, the improvement ratio depends on the workload and the guest memory size. In general, the larger the memory size is the more benefits we get. Note: This does not change other architectures's logic but the allocation size becomes twice. This will increase the actual memory consumption only when the new size changes the number of pages allocated by vmalloc(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 16 +++++----------- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 11 +++++++++-- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a2a785472431..35f82f2c66f6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3208,18 +3208,15 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memslots *slots, *old_slots; unsigned long *dirty_bitmap; - r = -ENOMEM; - dirty_bitmap = vmalloc(n); - if (!dirty_bitmap) - goto out; + dirty_bitmap = memslot->dirty_bitmap_head; + if (memslot->dirty_bitmap == dirty_bitmap) + dirty_bitmap += n / sizeof(long); memset(dirty_bitmap, 0, n); r = -ENOMEM; slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) { - vfree(dirty_bitmap); + if (!slots) goto out; - } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; slots->generation++; @@ -3235,11 +3232,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { - vfree(dirty_bitmap); + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) goto out; - } - vfree(dirty_bitmap); } else { r = -EFAULT; if (clear_user(log->dirty_bitmap, n)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 462b982fedfb..bcf71c7730f0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -150,6 +150,7 @@ struct kvm_memory_slot { unsigned long flags; unsigned long *rmap; unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_head; struct { unsigned long rmap_pde; int write_count; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0021c2862140..27649fdaa007 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -449,8 +449,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) if (!memslot->dirty_bitmap) return; - vfree(memslot->dirty_bitmap); + vfree(memslot->dirty_bitmap_head); memslot->dirty_bitmap = NULL; + memslot->dirty_bitmap_head = NULL; } /* @@ -537,15 +538,21 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } +/* + * Allocation size is twice as large as the actual dirty bitmap size. + * This makes it possible to do double buffering: see x86's + * kvm_vm_ioctl_get_dirty_log(). + */ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { - unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot); + unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); memslot->dirty_bitmap = vmalloc(dirty_bytes); if (!memslot->dirty_bitmap) return -ENOMEM; memset(memslot->dirty_bitmap, 0, dirty_bytes); + memslot->dirty_bitmap_head = memslot->dirty_bitmap; return 0; } -- cgit v1.2.3-70-g09d2 From c9b263d2be9c535b410f6617710534f798bf0ff0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 1 Nov 2010 16:58:43 +0800 Subject: KVM: fix tracing kvm_try_async_get_page Tracing 'async' and *pfn is useless, since 'async' is always true, and '*pfn' is always "fault_pfn' We can trace 'gva' and 'gfn' instead, it can help us to see the life-cycle of an async_pf Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- include/trace/events/kvm.h | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 53509f5973db..272e30200030 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2628,7 +2628,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn, put_page(pfn_to_page(*pfn)); if (!no_apf && can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(async, *pfn); + trace_kvm_try_async_get_page(gva, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(gva, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index c86f4e8e0bc9..d94d6c312ca1 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -218,18 +218,20 @@ TRACE_EVENT(kvm_age_page, #ifdef CONFIG_KVM_ASYNC_PF TRACE_EVENT( kvm_try_async_get_page, - TP_PROTO(bool async, u64 pfn), - TP_ARGS(async, pfn), + TP_PROTO(u64 gva, u64 gfn), + TP_ARGS(gva, gfn), TP_STRUCT__entry( - __field(__u64, pfn) + __field(u64, gva) + __field(u64, gfn) ), TP_fast_assign( - __entry->pfn = (!async) ? pfn : (u64)-1; + __entry->gva = gva; + __entry->gfn = gfn; ), - TP_printk("pfn %#llx", __entry->pfn) + TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) ); TRACE_EVENT( -- cgit v1.2.3-70-g09d2 From 0730388b97d20cc568c25b42b9a23b28959b481f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 1 Nov 2010 16:59:39 +0800 Subject: KVM: cleanup async_pf tracepoints Use 'DECLARE_EVENT_CLASS' to cleanup async_pf tracepoints Acked-by: Gleb Natapov Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- include/trace/events/kvm.h | 76 +++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index d94d6c312ca1..46e3cd8e197a 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -216,59 +216,71 @@ TRACE_EVENT(kvm_age_page, ); #ifdef CONFIG_KVM_ASYNC_PF -TRACE_EVENT( - kvm_try_async_get_page, +DECLARE_EVENT_CLASS(kvm_async_get_page_class, + TP_PROTO(u64 gva, u64 gfn), + TP_ARGS(gva, gfn), TP_STRUCT__entry( - __field(u64, gva) + __field(__u64, gva) __field(u64, gfn) - ), + ), TP_fast_assign( __entry->gva = gva; __entry->gfn = gfn; - ), + ), TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) ); -TRACE_EVENT( - kvm_async_pf_not_present, +DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page, + + TP_PROTO(u64 gva, u64 gfn), + + TP_ARGS(gva, gfn) +); + +DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault, + + TP_PROTO(u64 gva, u64 gfn), + + TP_ARGS(gva, gfn) +); + +DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready, + TP_PROTO(u64 token, u64 gva), + TP_ARGS(token, gva), TP_STRUCT__entry( __field(__u64, token) __field(__u64, gva) - ), + ), TP_fast_assign( __entry->token = token; __entry->gva = gva; - ), + ), + + TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva) - TP_printk("token %#llx gva %#llx not present", __entry->token, - __entry->gva) ); -TRACE_EVENT( - kvm_async_pf_ready, +DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present, + TP_PROTO(u64 token, u64 gva), - TP_ARGS(token, gva), - TP_STRUCT__entry( - __field(__u64, token) - __field(__u64, gva) - ), + TP_ARGS(token, gva) +); - TP_fast_assign( - __entry->token = token; - __entry->gva = gva; - ), +DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready, + + TP_PROTO(u64 token, u64 gva), - TP_printk("token %#llx gva %#llx ready", __entry->token, __entry->gva) + TP_ARGS(token, gva) ); TRACE_EVENT( @@ -292,24 +304,6 @@ TRACE_EVENT( __entry->address, __entry->pfn) ); -TRACE_EVENT( - kvm_async_pf_doublefault, - TP_PROTO(u64 gva, u64 gfn), - TP_ARGS(gva, gfn), - - TP_STRUCT__entry( - __field(u64, gva) - __field(u64, gfn) - ), - - TP_fast_assign( - __entry->gva = gva; - __entry->gfn = gfn; - ), - - TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) -); - #endif #endif /* _TRACE_KVM_MAIN_H */ -- cgit v1.2.3-70-g09d2 From d89f5eff70a31237ffa1e21c51d23ca532110aea Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 9 Nov 2010 17:02:49 +0100 Subject: KVM: Clean up vm creation and release IA64 support forces us to abstract the allocation of the kvm structure. But instead of mixing this up with arch-specific initialization and doing the same on destruction, split both steps. This allows to move generic destruction calls into generic code. It also fixes error clean-up on failures of kvm_create_vm for IA64. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 4 ++++ arch/ia64/kvm/kvm-ia64.c | 28 +++++++--------------------- arch/powerpc/kvm/powerpc.c | 20 +++----------------- arch/s390/kvm/kvm-s390.c | 23 ++++++----------------- arch/x86/kvm/x86.c | 12 ++---------- include/linux/kvm_host.h | 15 ++++++++++++++- virt/kvm/kvm_main.c | 19 +++++++++++++------ 7 files changed, 49 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 2f229e5de498..2689ee54a1c9 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -590,6 +590,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); void kvm_sal_emul(struct kvm_vcpu *vcpu); +#define __KVM_HAVE_ARCH_VM_ALLOC 1 +struct kvm *kvm_arch_alloc_vm(void); +void kvm_arch_free_vm(struct kvm *kvm); + #endif /* __ASSEMBLY__*/ #endif diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index f56a6316e134..48a48bdc59c3 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -749,7 +749,7 @@ out: return r; } -static struct kvm *kvm_alloc_kvm(void) +struct kvm *kvm_arch_alloc_vm(void) { struct kvm *kvm; @@ -760,7 +760,7 @@ static struct kvm *kvm_alloc_kvm(void) vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE)); if (!vm_base) - return ERR_PTR(-ENOMEM); + return NULL; memset((void *)vm_base, 0, KVM_VM_DATA_SIZE); kvm = (struct kvm *)(vm_base + @@ -806,10 +806,12 @@ static void kvm_build_io_pmt(struct kvm *kvm) #define GUEST_PHYSICAL_RR4 0x2739 #define VMM_INIT_RR 0x1660 -static void kvm_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm) { BUG_ON(!kvm); + kvm->arch.is_sn2 = ia64_platform_is("sn2"); + kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0; kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4; kvm->arch.vmm_init_rr = VMM_INIT_RR; @@ -823,21 +825,8 @@ static void kvm_init_vm(struct kvm *kvm) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); -} - -struct kvm *kvm_arch_create_vm(void) -{ - struct kvm *kvm = kvm_alloc_kvm(); - - if (IS_ERR(kvm)) - return ERR_PTR(-ENOMEM); - - kvm->arch.is_sn2 = ia64_platform_is("sn2"); - - kvm_init_vm(kvm); - - return kvm; + return 0; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, @@ -1357,7 +1346,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -EINVAL; } -static void free_kvm(struct kvm *kvm) +void kvm_arch_free_vm(struct kvm *kvm) { unsigned long vm_base = kvm->arch.vm_base; @@ -1399,9 +1388,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) #endif kfree(kvm->arch.vioapic); kvm_release_vm_pages(kvm); - kvm_free_physmem(kvm); - cleanup_srcu_struct(&kvm->srcu); - free_kvm(kvm); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 38f756f25053..99758460efde 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -145,18 +145,12 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = kvmppc_core_check_processor_compat(); } -struct kvm *kvm_arch_create_vm(void) +int kvm_arch_init_vm(struct kvm *kvm) { - struct kvm *kvm; - - kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - if (!kvm) - return ERR_PTR(-ENOMEM); - - return kvm; + return 0; } -static void kvmppc_free_vcpus(struct kvm *kvm) +void kvm_arch_destroy_vm(struct kvm *kvm) { unsigned int i; struct kvm_vcpu *vcpu; @@ -176,14 +170,6 @@ void kvm_arch_sync_events(struct kvm *kvm) { } -void kvm_arch_destroy_vm(struct kvm *kvm) -{ - kvmppc_free_vcpus(kvm); - kvm_free_physmem(kvm); - cleanup_srcu_struct(&kvm->srcu); - kfree(kvm); -} - int kvm_dev_ioctl_check_extension(long ext) { int r; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 985d825494f1..bade533ba288 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -164,24 +164,18 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } -struct kvm *kvm_arch_create_vm(void) +int kvm_arch_init_vm(struct kvm *kvm) { - struct kvm *kvm; int rc; char debug_name[16]; rc = s390_enable_sie(); if (rc) - goto out_nokvm; - - rc = -ENOMEM; - kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - if (!kvm) - goto out_nokvm; + goto out_err; kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL); if (!kvm->arch.sca) - goto out_nosca; + goto out_err; sprintf(debug_name, "kvm-%u", current->pid); @@ -195,13 +189,11 @@ struct kvm *kvm_arch_create_vm(void) debug_register_view(kvm->arch.dbf, &debug_sprintf_view); VM_EVENT(kvm, 3, "%s", "vm created"); - return kvm; + return 0; out_nodbf: free_page((unsigned long)(kvm->arch.sca)); -out_nosca: - kfree(kvm); -out_nokvm: - return ERR_PTR(rc); +out_err: + return rc; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -240,11 +232,8 @@ void kvm_arch_sync_events(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); free_page((unsigned long)(kvm->arch.sca)); debug_unregister(kvm->arch.dbf); - cleanup_srcu_struct(&kvm->srcu); - kfree(kvm); } /* Section: vcpu related */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5daead833669..b7ee61d5bc81 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5961,13 +5961,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } -struct kvm *kvm_arch_create_vm(void) +int kvm_arch_init_vm(struct kvm *kvm) { - struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - - if (!kvm) - return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5976,7 +5971,7 @@ struct kvm *kvm_arch_create_vm(void) spin_lock_init(&kvm->arch.tsc_write_lock); - return kvm; + return 0; } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) @@ -6021,13 +6016,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); if (kvm->arch.apic_access_page) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); - cleanup_srcu_struct(&kvm->srcu); - kfree(kvm); } int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bcf71c7730f0..2d63f2c0137c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -441,7 +442,19 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); -struct kvm *kvm_arch_create_vm(void); +#ifndef __KVM_HAVE_ARCH_VM_ALLOC +static inline struct kvm *kvm_arch_alloc_vm(void) +{ + return kzalloc(sizeof(struct kvm), GFP_KERNEL); +} + +static inline void kvm_arch_free_vm(struct kvm *kvm) +{ + kfree(kvm); +} +#endif + +int kvm_arch_init_vm(struct kvm *kvm); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fce0578eab0e..4023264c4cd5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -383,11 +383,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) static struct kvm *kvm_create_vm(void) { - int r = 0, i; - struct kvm *kvm = kvm_arch_create_vm(); + int r, i; + struct kvm *kvm = kvm_arch_alloc_vm(); - if (IS_ERR(kvm)) - goto out; + if (!kvm) + return ERR_PTR(-ENOMEM); + + r = kvm_arch_init_vm(kvm); + if (r) + goto out_err_nodisable; r = hardware_enable_all(); if (r) @@ -427,7 +431,7 @@ static struct kvm *kvm_create_vm(void) spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); -out: + return kvm; out_err: @@ -438,7 +442,7 @@ out_err_nodisable: for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm->buses[i]); kfree(kvm->memslots); - kfree(kvm); + kvm_arch_free_vm(kvm); return ERR_PTR(r); } @@ -512,6 +516,9 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_arch_flush_shadow(kvm); #endif kvm_arch_destroy_vm(kvm); + kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); + kvm_arch_free_vm(kvm); hardware_disable_all(); mmdrop(mm); } -- cgit v1.2.3-70-g09d2 From 0645211c43df0b96c51e12980066b3227e10b164 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 16 Nov 2010 22:30:03 +0100 Subject: KVM: Switch assigned device IRQ forwarding to threaded handler This improves the IRQ forwarding for assigned devices: By using the kernel's threaded IRQ scheme, we can get rid of the latency-prone work queue and simplify the code in the same run. Moreover, we no longer have to hold assigned_dev_lock while raising the guest IRQ, which can be a lenghty operation as we may have to iterate over all VCPUs. The lock is now only used for synchronizing masking vs. unmasking of INTx-type IRQs, thus is renames to intx_lock. Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 12 +----- virt/kvm/assigned-dev.c | 107 +++++++++++++++-------------------------------- 2 files changed, 36 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2d63f2c0137c..9fe7fefe76b1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -470,16 +470,8 @@ struct kvm_irq_ack_notifier { void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; -#define KVM_ASSIGNED_MSIX_PENDING 0x1 -struct kvm_guest_msix_entry { - u32 vector; - u16 entry; - u16 flags; -}; - struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; - struct work_struct interrupt_work; struct list_head list; int assigned_dev_id; int host_segnr; @@ -490,13 +482,13 @@ struct kvm_assigned_dev_kernel { bool host_irq_disabled; struct msix_entry *host_msix_entries; int guest_irq; - struct kvm_guest_msix_entry *guest_msix_entries; + struct msix_entry *guest_msix_entries; unsigned long irq_requested_type; int irq_source_id; int flags; struct pci_dev *dev; struct kvm *kvm; - spinlock_t assigned_dev_lock; + spinlock_t intx_lock; }; struct kvm_irq_mask_notifier { diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ecc4419fadca..1d77ce16360a 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel return index; } -static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) { - struct kvm_assigned_dev_kernel *assigned_dev; - int i; + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + u32 vector; + int index; - assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, - interrupt_work); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { + spin_lock(&assigned_dev->intx_lock); + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + spin_unlock(&assigned_dev->intx_lock); + } - spin_lock_irq(&assigned_dev->assigned_dev_lock); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - struct kvm_guest_msix_entry *guest_entries = - assigned_dev->guest_msix_entries; - for (i = 0; i < assigned_dev->entries_nr; i++) { - if (!(guest_entries[i].flags & - KVM_ASSIGNED_MSIX_PENDING)) - continue; - guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; + index = find_index_from_host_irq(assigned_dev, irq); + if (index >= 0) { + vector = assigned_dev-> + guest_msix_entries[index].vector; kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, - guest_entries[i].vector, 1); + assigned_dev->irq_source_id, vector, 1); } } else kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, assigned_dev->guest_irq, 1); - spin_unlock_irq(&assigned_dev->assigned_dev_lock); -} - -static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) -{ - unsigned long flags; - struct kvm_assigned_dev_kernel *assigned_dev = - (struct kvm_assigned_dev_kernel *) dev_id; - - spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int index = find_index_from_host_irq(assigned_dev, irq); - if (index < 0) - goto out; - assigned_dev->guest_msix_entries[index].flags |= - KVM_ASSIGNED_MSIX_PENDING; - } - - schedule_work(&assigned_dev->interrupt_work); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - } - -out: - spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); return IRQ_HANDLED; } @@ -114,7 +87,6 @@ out: static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_assigned_dev_kernel *dev; - unsigned long flags; if (kian->gsi == -1) return; @@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) /* The guest irq may be shared so this ack may be * from another device. */ - spin_lock_irqsave(&dev->assigned_dev_lock, flags); + spin_lock(&dev->intx_lock); if (dev->host_irq_disabled) { enable_irq(dev->host_irq); dev->host_irq_disabled = false; } - spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); + spin_unlock(&dev->intx_lock); } static void deassign_guest_irq(struct kvm *kvm, @@ -155,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { /* - * In kvm_free_device_irq, cancel_work_sync return true if: - * 1. work is scheduled, and then cancelled. - * 2. work callback is executed. - * - * The first one ensured that the irq is disabled and no more events - * would happen. But for the second one, the irq may be enabled (e.g. - * for MSI). So we disable irq here to prevent further events. + * We disable irq here to prevent further events. * * Notice this maybe result in nested disable if the interrupt type is * INTx, but it's OK for we are going to free it. * * If this function is a part of VM destroy, please ensure that till * now, the kvm state is still legal for probably we also have to wait - * interrupt_work done. + * on a currently running IRQ handler. */ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { int i; for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq_nosync(assigned_dev-> - host_msix_entries[i].vector); - - cancel_work_sync(&assigned_dev->interrupt_work); + disable_irq(assigned_dev->host_msix_entries[i].vector); for (i = 0; i < assigned_dev->entries_nr; i++) free_irq(assigned_dev->host_msix_entries[i].vector, @@ -188,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm, pci_disable_msix(assigned_dev->dev); } else { /* Deal with MSI and INTx */ - disable_irq_nosync(assigned_dev->host_irq); - cancel_work_sync(&assigned_dev->interrupt_work); + disable_irq(assigned_dev->host_irq); free_irq(assigned_dev->host_irq, (void *)assigned_dev); @@ -268,8 +230,9 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, * on the same interrupt line is not a happy situation: there * are going to be long delays in accepting, acking, etc. */ - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, - 0, "kvm_assigned_intx_device", (void *)dev)) + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + IRQF_ONESHOT, "kvm_assigned_intx_device", + (void *)dev)) return -EIO; return 0; } @@ -287,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, } dev->host_irq = dev->dev->irq; - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_msi_device", (void *)dev)) { + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + 0, "kvm_assigned_msi_device", (void *)dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -313,10 +276,10 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, return r; for (i = 0; i < dev->entries_nr; i++) { - r = request_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_intr, 0, - "kvm_assigned_msix_device", - (void *)dev); + r = request_threaded_irq(dev->host_msix_entries[i].vector, + NULL, kvm_assigned_dev_thread, + 0, "kvm_assigned_msix_device", + (void *)dev); if (r) goto err; } @@ -557,12 +520,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->host_devfn = assigned_dev->devfn; match->flags = assigned_dev->flags; match->dev = dev; - spin_lock_init(&match->assigned_dev_lock); + spin_lock_init(&match->intx_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); list_add(&match->list, &kvm->arch.assigned_dev_head); @@ -654,9 +615,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, r = -ENOMEM; goto msix_nr_out; } - adev->guest_msix_entries = kzalloc( - sizeof(struct kvm_guest_msix_entry) * - entry_nr->entry_nr, GFP_KERNEL); + adev->guest_msix_entries = + kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, + GFP_KERNEL); if (!adev->guest_msix_entries) { kfree(adev->host_msix_entries); r = -ENOMEM; -- cgit v1.2.3-70-g09d2 From 1e001d49f9f9a0e3eb72939ad49d9a2c7754e9c1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 16 Nov 2010 22:30:04 +0100 Subject: KVM: Refactor IRQ names of assigned devices Cosmetic change, but it helps to correlate IRQs with PCI devices. Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 1 + virt/kvm/assigned-dev.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9fe7fefe76b1..4bd663d6443d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -489,6 +489,7 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; spinlock_t intx_lock; + char irq_name[32]; }; struct kvm_irq_mask_notifier { diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 1d77ce16360a..7623408dbc81 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -231,8 +231,7 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, * are going to be long delays in accepting, acking, etc. */ if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - IRQF_ONESHOT, "kvm_assigned_intx_device", - (void *)dev)) + IRQF_ONESHOT, dev->irq_name, (void *)dev)) return -EIO; return 0; } @@ -251,7 +250,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, dev->host_irq = dev->dev->irq; if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - 0, "kvm_assigned_msi_device", (void *)dev)) { + 0, dev->irq_name, (void *)dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -278,8 +277,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, for (i = 0; i < dev->entries_nr; i++) { r = request_threaded_irq(dev->host_msix_entries[i].vector, NULL, kvm_assigned_dev_thread, - 0, "kvm_assigned_msix_device", - (void *)dev); + 0, dev->irq_name, (void *)dev); if (r) goto err; } @@ -336,6 +334,9 @@ static int assign_host_irq(struct kvm *kvm, if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) return r; + snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", + pci_name(dev->dev)); + switch (host_irq_type) { case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); -- cgit v1.2.3-70-g09d2 From bd2b53b20fcd0d6c4c815b54e6d464e34429d3a4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 18 Nov 2010 19:09:08 +0200 Subject: KVM: fast-path msi injection with irqfd Store irq routing table pointer in the irqfd object, and use that to inject MSI directly without bouncing out to a kernel thread. While we touch this structure, rearrange irqfd fields to make fastpath better packed for better cache utilization. This also adds some comments about locking rules and rcu usage in code. Some notes on the design: - Use pointer into the rt instead of copying an entry, to make it possible to use rcu, thus side-stepping locking complexities. We also save some memory this way. - Old workqueue code is still used for level irqs. I don't think we DTRT with level anyway, however, it seems easier to keep the code around as it has been thought through and debugged, and fix level later than rip out and re-instate it later. Signed-off-by: Michael S. Tsirkin Acked-by: Marcelo Tosatti Acked-by: Gregory Haskins Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 16 +++++++++ virt/kvm/eventfd.c | 91 +++++++++++++++++++++++++++++++++++++++++------- virt/kvm/irq_comm.c | 7 ++-- 3 files changed, 99 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4bd663d6443d..f17beae3cca0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -240,6 +241,10 @@ struct kvm { struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP + /* + * Update side is protected by irq_lock and, + * if configured, irqfds.lock. + */ struct kvm_irq_routing_table __rcu *irq_routing; struct hlist_head mask_notifier_list; struct hlist_head irq_ack_notifier_list; @@ -511,6 +516,8 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, unsigned long *deliver_bitmask); #endif int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, + int irq_source_id, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); @@ -652,17 +659,26 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} void kvm_eventfd_init(struct kvm *kvm); int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); void kvm_irqfd_release(struct kvm *kvm); +void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); #else static inline void kvm_eventfd_init(struct kvm *kvm) {} + static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) { return -EINVAL; } static inline void kvm_irqfd_release(struct kvm *kvm) {} + +static inline void kvm_irq_routing_update(struct kvm *kvm, + struct kvm_irq_routing_table *irq_rt) +{ + rcu_assign_pointer(kvm->irq_routing, irq_rt); +} + static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { return -ENOSYS; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index c1f1e3c62984..2ca4535f4fb7 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -44,14 +44,19 @@ */ struct _irqfd { - struct kvm *kvm; - struct eventfd_ctx *eventfd; - int gsi; - struct list_head list; - poll_table pt; - wait_queue_t wait; - struct work_struct inject; - struct work_struct shutdown; + /* Used for MSI fast-path */ + struct kvm *kvm; + wait_queue_t wait; + /* Update side is protected by irqfds.lock */ + struct kvm_kernel_irq_routing_entry __rcu *irq_entry; + /* Used for level IRQ fast-path */ + int gsi; + struct work_struct inject; + /* Used for setup/shutdown */ + struct eventfd_ctx *eventfd; + struct list_head list; + poll_table pt; + struct work_struct shutdown; }; static struct workqueue_struct *irqfd_cleanup_wq; @@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); unsigned long flags = (unsigned long)key; + struct kvm_kernel_irq_routing_entry *irq; + struct kvm *kvm = irqfd->kvm; - if (flags & POLLIN) + if (flags & POLLIN) { + rcu_read_lock(); + irq = rcu_dereference(irqfd->irq_entry); /* An event has been signaled, inject an interrupt */ - schedule_work(&irqfd->inject); + if (irq) + kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); + else + schedule_work(&irqfd->inject); + rcu_read_unlock(); + } if (flags & POLLHUP) { /* The eventfd is closing, detach from KVM */ - struct kvm *kvm = irqfd->kvm; unsigned long flags; spin_lock_irqsave(&kvm->irqfds.lock, flags); @@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, add_wait_queue(wqh, &irqfd->wait); } +/* Must be called under irqfds.lock */ +static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, + struct kvm_irq_routing_table *irq_rt) +{ + struct kvm_kernel_irq_routing_entry *e; + struct hlist_node *n; + + if (irqfd->gsi >= irq_rt->nr_rt_entries) { + rcu_assign_pointer(irqfd->irq_entry, NULL); + return; + } + + hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) { + /* Only fast-path MSI. */ + if (e->type == KVM_IRQ_ROUTING_MSI) + rcu_assign_pointer(irqfd->irq_entry, e); + else + rcu_assign_pointer(irqfd->irq_entry, NULL); + } +} + static int kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) { + struct kvm_irq_routing_table *irq_rt; struct _irqfd *irqfd, *tmp; struct file *file = NULL; struct eventfd_ctx *eventfd = NULL; @@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) goto fail; } + irq_rt = rcu_dereference_protected(kvm->irq_routing, + lockdep_is_held(&kvm->irqfds.lock)); + irqfd_update(kvm, irqfd, irq_rt); + events = file->f_op->poll(file, &irqfd->pt); list_add_tail(&irqfd->list, &kvm->irqfds.items); @@ -271,8 +310,17 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { + /* + * This rcu_assign_pointer is needed for when + * another thread calls kvm_irqfd_update before + * we flush workqueue below. + * It is paired with synchronize_rcu done by caller + * of that function. + */ + rcu_assign_pointer(irqfd->irq_entry, NULL); irqfd_deactivate(irqfd); + } } spin_unlock_irq(&kvm->irqfds.lock); @@ -321,6 +369,25 @@ kvm_irqfd_release(struct kvm *kvm) } +/* + * Change irq_routing and irqfd. + * Caller must invoke synchronize_rcu afterwards. + */ +void kvm_irq_routing_update(struct kvm *kvm, + struct kvm_irq_routing_table *irq_rt) +{ + struct _irqfd *irqfd; + + spin_lock_irq(&kvm->irqfds.lock); + + rcu_assign_pointer(kvm->irq_routing, irq_rt); + + list_for_each_entry(irqfd, &kvm->irqfds.items, list) + irqfd_update(kvm, irqfd, irq_rt); + + spin_unlock_irq(&kvm->irqfds.lock); +} + /* * create a host-wide workqueue for issuing deferred shutdown requests * aggregated from all vm* instances. We need our own isolated single-thread diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 8edca9141b78..9f614b4e365f 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level) +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) { struct kvm_lapic_irq irq; @@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm, mutex_lock(&kvm->irq_lock); old = kvm->irq_routing; - rcu_assign_pointer(kvm->irq_routing, new); + kvm_irq_routing_update(kvm, new); mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); new = old; -- cgit v1.2.3-70-g09d2 From 27923eb19c5d1197bd9d1472abdc2e749f21387a Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 25 Nov 2010 10:25:44 +0100 Subject: KVM: PPC: Fix compile warning KVM compilation fails with the following warning: include/linux/kvm_host.h: In function 'kvm_irq_routing_update': include/linux/kvm_host.h:679:2: error: 'struct kvm' has no member named 'irq_routing' That function is only used and reasonable to have on systems that implement an in-kernel interrupt chip. PPC doesn't. Fix by #ifdef'ing it out when no irqchip is available. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f17beae3cca0..da0794f707f6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -673,11 +673,13 @@ static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) static inline void kvm_irqfd_release(struct kvm *kvm) {} +#ifdef CONFIG_HAVE_KVM_IRQCHIP static inline void kvm_irq_routing_update(struct kvm *kvm, struct kvm_irq_routing_table *irq_rt) { rcu_assign_pointer(kvm->irq_routing, irq_rt); } +#endif static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { -- cgit v1.2.3-70-g09d2 From a4ee1ca4a36e7857d90ae8c2b85f1bde9a042c10 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 23 Nov 2010 11:13:00 +0800 Subject: KVM: MMU: delay flush all tlbs on sync_page path Quote from Avi: | I don't think we need to flush immediately; set a "tlb dirty" bit somewhere | that is cleareded when we flush the tlb. kvm_mmu_notifier_invalidate_page() | can consult the bit and force a flush if set. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 12 ++++++++++-- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a43f4ccd30bb..2b3d66c7b68d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -746,6 +746,14 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. + * + * Note: + * We should flush all tlbs if spte is dropped even though guest is + * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page + * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't + * used by guest then tlbs are not flushed, so guest is allowed to access the + * freed pages. + * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -781,14 +789,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - kvm_flush_remote_tlbs(vcpu->kvm); + vcpu->kvm->tlbs_dirty++; continue; } if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i], shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); + vcpu->kvm->tlbs_dirty++; continue; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index da0794f707f6..ac4e83a1a10d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -254,6 +254,7 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; + long tlbs_dirty; #endif }; @@ -382,6 +383,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5156d458a84d..ee99b77e4451 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -168,8 +168,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { + int dirty_count = kvm->tlbs_dirty; + + smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; + cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } void kvm_reload_remote_mmus(struct kvm *kvm) @@ -249,7 +253,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; - need_tlb_flush = kvm_unmap_hva(kvm, address); + need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); @@ -293,6 +297,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mmu_notifier_count++; for (; start < end; start += PAGE_SIZE) need_tlb_flush |= kvm_unmap_hva(kvm, start); + need_tlb_flush |= kvm->tlbs_dirty; spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); -- cgit v1.2.3-70-g09d2 From d4dbf470096c51cb4785167ea59fdbdea87ccbe4 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 7 Dec 2010 12:59:07 +0900 Subject: KVM: MMU: Make the way of accessing lpage_info more generic Large page information has two elements but one of them, write_count, alone is accessed by a helper function. This patch replaces this helper function with more generic one which returns newly named kvm_lpage_info structure and use it to access the other element rmap_pde. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 54 ++++++++++++++++++++++-------------------------- include/linux/kvm_host.h | 10 +++++---- 2 files changed, 31 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index abda57fac659..475a1225f6ec 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -477,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) } /* - * Return the pointer to the largepage write count for a given - * gfn, handling slots that are not large page aligned. + * Return the pointer to the large page information for a given gfn, + * handling slots that are not large page aligned. */ -static int *slot_largepage_idx(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) +static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, + struct kvm_memory_slot *slot, + int level) { unsigned long idx; idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); - return &slot->lpage_info[level - 2][idx].write_count; + return &slot->lpage_info[level - 2][idx]; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - int *write_count; + struct kvm_lpage_info *linfo; int i; slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - write_count = slot_largepage_idx(gfn, slot, i); - *write_count += 1; + linfo = lpage_info_slot(gfn, slot, i); + linfo->write_count += 1; } } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - int *write_count; + struct kvm_lpage_info *linfo; int i; slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - write_count = slot_largepage_idx(gfn, slot, i); - *write_count -= 1; - WARN_ON(*write_count < 0); + linfo = lpage_info_slot(gfn, slot, i); + linfo->write_count -= 1; + WARN_ON(linfo->write_count < 0); } } @@ -525,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm, int level) { struct kvm_memory_slot *slot; - int *largepage_idx; + struct kvm_lpage_info *linfo; slot = gfn_to_memslot(kvm, gfn); if (slot) { - largepage_idx = slot_largepage_idx(gfn, slot, level); - return *largepage_idx; + linfo = lpage_info_slot(gfn, slot, level); + return linfo->write_count; } return 1; @@ -585,16 +585,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; - unsigned long idx; + struct kvm_lpage_info *linfo; slot = gfn_to_memslot(kvm, gfn); if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); + linfo = lpage_info_slot(gfn, slot, level); - return &slot->lpage_info[level - 2][idx].rmap_pde; + return &linfo->rmap_pde; } /* @@ -882,19 +881,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + gfn_t gfn = memslot->base_gfn + gfn_offset; ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - unsigned long idx; - int sh; - - sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); - idx = ((memslot->base_gfn+gfn_offset) >> sh) - - (memslot->base_gfn >> sh); - ret |= handler(kvm, - &memslot->lpage_info[j][idx].rmap_pde, - data); + struct kvm_lpage_info *linfo; + + linfo = lpage_info_slot(gfn, memslot, + PT_DIRECTORY_LEVEL + j); + ret |= handler(kvm, &linfo->rmap_pde, data); } trace_kvm_age_page(hva, memslot, ret); retval |= ret; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ac4e83a1a10d..bd0da8f12500 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -146,6 +146,11 @@ struct kvm_vcpu { */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) +struct kvm_lpage_info { + unsigned long rmap_pde; + int write_count; +}; + struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; @@ -153,10 +158,7 @@ struct kvm_memory_slot { unsigned long *rmap; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_head; - struct { - unsigned long rmap_pde; - int write_count; - } *lpage_info[KVM_NR_PAGE_SIZES - 1]; + struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; unsigned long userspace_addr; int user_alloc; int id; -- cgit v1.2.3-70-g09d2 From 5c663a1534d27d817e17eed06a83d08f497f9f4f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 8 Dec 2010 18:04:51 +0200 Subject: KVM: Fix build error on s390 due to missing tlbs_dirty Make it available for all archs. Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bd0da8f12500..b5021db21858 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -256,8 +256,8 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; - long tlbs_dirty; #endif + long tlbs_dirty; }; /* The guest did something we don't support. */ -- cgit v1.2.3-70-g09d2