From 94171b19c3f1f4d9d4c0e3aaa1aa161def1ec7ea Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 27 Jul 2017 11:54:53 +0530 Subject: powerpc/mm: Rename find_linux_pte_or_hugepte() Add newer helpers to make the function usage simpler. It is always recommended to use find_current_mm_pte() for walking the page table. If we cannot use find_current_mm_pte(), it should be documented why the said usage of __find_linux_pte() is safe against a parallel THP split. For now we have KVM code using __find_linux_pte(). This is because kvm code ends up calling __find_linux_pte() in real mode with MSR_EE=0 but with PACA soft_enabled = 1. We may want to fix that later and make sure we keep the MSR_EE and PACA soft_enabled in sync. When we do that we can switch kvm to use find_linux_pte(). Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pgtable.h | 10 +--------- arch/powerpc/include/asm/pte-walk.h | 35 ++++++++++++++++++++++++++++++++++ arch/powerpc/kernel/eeh.c | 4 ++-- arch/powerpc/kernel/io-workarounds.c | 5 +++-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 5 +++-- arch/powerpc/kvm/book3s_64_mmu_radix.c | 28 +++++++++++++-------------- arch/powerpc/kvm/book3s_64_vio_hv.c | 12 +++++++++++- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 18 ++++++++--------- arch/powerpc/kvm/e500_mmu_host.c | 3 ++- arch/powerpc/mm/hash_utils_64.c | 5 +++-- arch/powerpc/mm/hugetlbpage.c | 24 +++++++++++++---------- arch/powerpc/mm/tlb_hash64.c | 6 ++++-- arch/powerpc/perf/callchain.c | 3 ++- 13 files changed, 103 insertions(+), 55 deletions(-) create mode 100644 arch/powerpc/include/asm/pte-walk.h diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index afae9a336136..eb9d57defb75 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -66,16 +66,8 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 #endif 
-pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *shift); -static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *shift) -{ - VM_WARN(!arch_irqs_disabled(), - "%s called with irq enabled\n", __func__); - return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); -} +/* can we use this in kvm */ unsigned long vmalloc_to_phys(void *vmalloc_addr); void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); diff --git a/arch/powerpc/include/asm/pte-walk.h b/arch/powerpc/include/asm/pte-walk.h new file mode 100644 index 000000000000..2d633e9d686c --- /dev/null +++ b/arch/powerpc/include/asm/pte-walk.h @@ -0,0 +1,35 @@ +#ifndef _ASM_POWERPC_PTE_WALK_H +#define _ASM_POWERPC_PTE_WALK_H + +#include + +/* Don't use this directly */ +extern pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hshift); + +static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hshift) +{ + VM_WARN(!arch_irqs_disabled(), "%s called with irq enabled\n", __func__); + return __find_linux_pte(pgdir, ea, is_thp, hshift); +} + +static inline pte_t *find_init_mm_pte(unsigned long ea, unsigned *hshift) +{ + pgd_t *pgdir = init_mm.pgd; + return __find_linux_pte(pgdir, ea, NULL, hshift); +} +/* + * This is what we should always use. Any other lockless page table lookup needs + * careful audit against THP split. 
+ */ +static inline pte_t *find_current_mm_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hshift) +{ + VM_WARN(!arch_irqs_disabled(), "%s called with irq enabled\n", __func__); + VM_WARN(pgdir != current->mm->pgd, + "%s lock less page table lookup called on wrong mm\n", __func__); + return __find_linux_pte(pgdir, ea, is_thp, hshift); +} + +#endif /* _ASM_POWERPC_PTE_WALK_H */ diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 63992b2d8e15..5e6887c40528 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -44,6 +44,7 @@ #include #include #include +#include /** Overview: @@ -352,8 +353,7 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) * worried about _PAGE_SPLITTING/collapse. Also we will not hit * page table free, because of init_mm. */ - ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, - NULL, &hugepage_shift); + ptep = find_init_mm_pte(token, &hugepage_shift); if (!ptep) return token; WARN_ON(hugepage_shift); diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index a582e0d42525..bbe85f5aea71 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -19,6 +19,8 @@ #include #include #include +#include + #define IOWA_MAX_BUS 8 @@ -75,8 +77,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) * We won't find huge pages here (iomem). 
Also can't hit * a page table free due to init_mm */ - ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr, - NULL, &hugepage_shift); + ptep = find_init_mm_pte(vaddr, &hugepage_shift); if (ptep == NULL) paddr = 0; else { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8cb0190e2a73..4b219db39c47 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "trace_hv.h" @@ -597,8 +598,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * hugepage split and collapse. */ local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(current->mm->pgd, - hva, NULL, NULL); + ptep = find_current_mm_pte(current->mm->pgd, + hva, NULL, NULL); if (ptep) { pte = kvmppc_read_update_linux_pte(ptep, 1); if (__pte_write(pte)) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index f6b3e67c5762..7d719c8aa0bb 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Supported radix tree geometry. @@ -359,8 +360,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (writing) pgflags |= _PAGE_DIRTY; local_irq_save(flags); - ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva, - NULL, NULL); + ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL); if (ptep) { pte = READ_ONCE(*ptep); if (pte_present(pte) && @@ -374,8 +374,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, spin_unlock(&kvm->mmu_lock); return RESUME_GUEST; } - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, - gpa, NULL, &shift); + /* + * We are walking the secondary page table here. We can do this + * without disabling irq. 
+ */ + ptep = __find_linux_pte(kvm->arch.pgtable, + gpa, NULL, &shift); if (ptep && pte_present(*ptep)) { kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift); @@ -427,8 +431,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pgflags |= _PAGE_WRITE; } else { local_irq_save(flags); - ptep = __find_linux_pte_or_hugepte(current->mm->pgd, - hva, NULL, NULL); + ptep = find_current_mm_pte(current->mm->pgd, + hva, NULL, NULL); if (ptep && pte_write(*ptep) && pte_dirty(*ptep)) pgflags |= _PAGE_WRITE; local_irq_restore(flags); @@ -499,8 +503,7 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned int shift; unsigned long old; - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, - NULL, &shift); + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep)) { old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, gpa, shift); @@ -525,8 +528,7 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned int shift; int ref = 0; - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, - NULL, &shift); + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) { kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, gpa, shift); @@ -545,8 +547,7 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned int shift; int ref = 0; - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, - NULL, &shift); + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) ref = 1; return ref; @@ -562,8 +563,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, unsigned int shift; int ret = 0; - ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, - NULL, &shift); + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { ret = 1; if (shift) diff 
--git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 3adfd2f5301c..c32e9bfe75b1 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -39,6 +39,7 @@ #include #include #include +#include #ifdef CONFIG_BUG @@ -353,7 +354,16 @@ static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, pte_t *ptep, pte; unsigned shift = 0; - ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift); + /* + * Called in real mode with MSR_EE = 0. We are safe here. + * It is ok to do the lookup with arch.pgdir here, because + * we are doing this on secondary cpus and current task there + * is not the hypervisor. Also this is safe against THP in the + * host, because an IPI to primary thread will wait for the secondary + * to exit which will again result in the below page table walk + * to finish. + */ + ptep = __find_linux_pte(vcpu->arch.pgdir, ua, NULL, &shift); if (!ptep || !pte_present(*ptep)) return -ENXIO; pte = *ptep; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 584c74c8119f..fedb0139524c 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -22,6 +22,7 @@ #include #include #include +#include /* Translate address of a vmalloc'd thing to a linear map address */ static void *real_vmalloc_addr(void *x) @@ -31,9 +32,9 @@ static void *real_vmalloc_addr(void *x) /* * assume we don't have huge pages in vmalloc space... * So don't worry about THP collapse/split. Called - * Only in realmode, hence won't need irq_save/restore. + * Only in realmode with MSR_EE = 0, hence won't need irq_save/restore. 
*/ - p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL, NULL); + p = find_init_mm_pte(addr, NULL); if (!p || !pte_present(*p)) return NULL; addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); @@ -230,14 +231,13 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, * If we had a page table table change after lookup, we would * retry via mmu_notifier_retry. */ - if (realmode) - ptep = __find_linux_pte_or_hugepte(pgdir, hva, NULL, - &hpage_shift); - else { + if (!realmode) local_irq_save(irq_flags); - ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, - &hpage_shift); - } + /* + * If called in real mode we have MSR_EE = 0. Otherwise + * we disable irq above. + */ + ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift); if (ptep) { pte_t pte; unsigned int host_pte_size; diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 77fd043b3ecc..c6c734424c70 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "e500.h" #include "timing.h" @@ -476,7 +477,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, * can't run hence pfn won't change. */ local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, NULL); + ptep = find_linux_pte(pgdir, hva, NULL, NULL); if (ptep) { pte_t pte = READ_ONCE(*ptep); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7a20669c19e7..5b10b4fcbf76 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -61,6 +61,7 @@ #include #include #include +#include #ifdef DEBUG #define DBG(fmt...) 
udbg_printf(fmt) @@ -1297,7 +1298,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift); + ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); rc = 1; @@ -1526,7 +1527,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, * THP pages use update_mmu_cache_pmd. We don't do * hash preload there. Hence can ignore THP here */ - ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift); + ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); if (!ptep) goto out_exit; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e1bf5ca397fe..70a3a2bdf06c 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -24,6 +24,8 @@ #include #include #include +#include + #ifdef CONFIG_HUGETLB_PAGE @@ -60,8 +62,11 @@ static unsigned nr_gpages; pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { - /* Only called for hugetlbfs pages, hence can ignore THP */ - return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); + /* + * Only called for hugetlbfs pages, hence can ignore THP and the + * irq disabled walk. + */ + return __find_linux_pte(mm->pgd, addr, NULL, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, @@ -886,9 +891,8 @@ void flush_dcache_icache_hugepage(struct page *page) * This function need to be called with interrupts disabled. 
We use this variant * when we have MSR[EE] = 0 but the paca->soft_enabled = 1 */ - -pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *shift) +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hpage_shift) { pgd_t pgd, *pgdp; pud_t pud, *pudp; @@ -897,8 +901,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, hugepd_t *hpdp = NULL; unsigned pdshift = PGDIR_SHIFT; - if (shift) - *shift = 0; + if (hpage_shift) + *hpage_shift = 0; if (is_thp) *is_thp = false; @@ -968,11 +972,11 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, ret_pte = hugepte_offset(*hpdp, ea, pdshift); pdshift = hugepd_shift(*hpdp); out: - if (shift) - *shift = pdshift; + if (hpage_shift) + *hpage_shift = pdshift; return ret_pte; } -EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); +EXPORT_SYMBOL_GPL(__find_linux_pte); int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index b5b0fb97b9c0..71cb2742bd16 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -29,6 +29,8 @@ #include #include #include +#include + #include @@ -207,8 +209,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp, - &hugepage_shift); + pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, + &hugepage_shift); unsigned long pte; if (ptep == NULL) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 0fc26714780a..0af051a1974e 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -22,6 +22,7 @@ #ifdef CONFIG_PPC64 #include "../kernel/ppc32.h" #endif +#include /* @@ -127,7 +128,7 @@ static int 
read_user_stack_slow(void __user *ptr, void *buf, int nb) return -EFAULT; local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(pgdir, addr, NULL, &shift); + ptep = find_current_mm_pte(pgdir, addr, NULL, &shift); if (!ptep) goto err_out; if (!shift) -- cgit v1.2.3-70-g09d2 From 94a04bc25a2c6296bd0c5e82c10e8231c2b11f77 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 25 Aug 2017 14:30:33 +1000 Subject: KVM: PPC: Book3S HV: POWER9 does not require secondary thread management POWER9 CPUs have independent MMU contexts per thread, so KVM does not need to quiesce secondary threads, so the hwthread_req/hwthread_state protocol does not have to be used. So patch it away on POWER9, and patch away the branch from the Linux idle wakeup to kvm_start_guest that is never used. Add a warning and error out of kvmppc_grab_hwthread in case it is ever called on POWER9. This avoids a hwsync in the idle wakeup path on POWER9. Signed-off-by: Nicholas Piggin Acked-by: Paul Mackerras [mpe: Use WARN(...) instead of WARN_ON()/pr_err(...)] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s_asm.h | 4 ++++ arch/powerpc/kernel/idle_book3s.S | 35 +++++++++++++++++++++---------- arch/powerpc/kvm/book3s_hv.c | 13 +++++++++++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 +++++++ 4 files changed, 48 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 7cea76f11c26..83596f32f50b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -104,6 +104,10 @@ struct kvmppc_host_state { u8 napping; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* + * hwthread_req/hwthread_state pair is used to pull sibling threads + * out of guest on pre-ISAv3.0B CPUs where threads share MMU. 
+ */ u8 hwthread_req; u8 hwthread_state; u8 host_ipi; diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 516ebef905c0..294d024bfb61 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -242,13 +242,20 @@ enter_winkle: /* * r3 - PSSCR value corresponding to the requested stop state. */ -power_enter_stop: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* Tell KVM we're entering idle */ +power_enter_stop_kvm_rm: + /* + * This is currently unused because POWER9 KVM does not have to + * gather secondary threads into sibling mode, but the code is + * here in case that function is required. + * + * Tell KVM we're entering idle. + */ li r4,KVM_HWTHREAD_IN_IDLE /* DO THIS IN REAL MODE! See comment above. */ stb r4,HSTATE_HWTHREAD_STATE(r13) #endif +power_enter_stop: /* * Check if we are executing the lite variant with ESL=EC=0 */ @@ -411,6 +418,18 @@ pnv_powersave_wakeup_mce: b pnv_powersave_wakeup +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm_start_guest_check: + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beqlr + b kvm_start_guest +#endif + /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss @@ -435,15 +454,9 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) mr r3,r12 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. 
testing hwthread_req */ - sync - lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beq 1f - b kvm_start_guest -1: +BEGIN_FTR_SECTION + bl kvm_start_guest_check +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #endif /* Return SRR1 from power7_nap() */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0b436df746fc..8bad44b46dc8 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2111,6 +2111,15 @@ static int kvmppc_grab_hwthread(int cpu) struct paca_struct *tpaca; long timeout = 10000; + /* + * ISA v3.0 idle routines do not set hwthread_state or test + * hwthread_req, so they can not grab idle threads. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + WARN(1, "KVM: can not control sibling threads\n"); + return -EBUSY; + } + tpaca = &paca[cpu]; /* Ensure the thread won't go into the kernel if it wakes */ @@ -2145,10 +2154,12 @@ static void kvmppc_release_hwthread(int cpu) struct paca_struct *tpaca; tpaca = &paca[cpu]; - tpaca->kvm_hstate.hwthread_req = 0; tpaca->kvm_hstate.kvm_vcpu = NULL; tpaca->kvm_hstate.kvm_vcore = NULL; tpaca->kvm_hstate.kvm_split_mode = NULL; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + tpaca->kvm_hstate.hwthread_req = 0; + } static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index cb44065e2946..eacf3d06eb75 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -149,9 +149,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) subf r4, r4, r3 mtspr SPRN_DEC, r4 +BEGIN_FTR_SECTION /* hwthread_req may have got set by cede or no vcpu, so clear it */ li r0, 0 stb r0, HSTATE_HWTHREAD_REQ(r13) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* * For external interrupts we need to call the Linux @@ -314,6 +316,7 @@ kvm_novcpu_exit: * Relocation is off and most register values are lost. * r13 points to the PACA. 
* r3 contains the SRR1 wakeup value, SRR1 is trashed. + * This is not used by ISAv3.0B processors. */ .globl kvm_start_guest kvm_start_guest: @@ -432,6 +435,9 @@ kvm_secondary_got_guest: * While waiting we also need to check if we get given a vcpu to run. */ kvm_no_guest: +BEGIN_FTR_SECTION + twi 31,0,0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r3, HSTATE_HWTHREAD_REQ(r13) cmpwi r3, 0 bne 53f @@ -2466,8 +2472,10 @@ kvm_do_nap: clrrdi r0, r0, 1 mtspr SPRN_CTRLT, r0 +BEGIN_FTR_SECTION li r0,1 stb r0,HSTATE_HWTHREAD_REQ(r13) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 BEGIN_FTR_SECTION -- cgit v1.2.3-70-g09d2 From 73e77c0982fd25ddb536339906412cbed78d0b79 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 13 Jul 2017 10:38:29 +0300 Subject: KVM: PPC: e500: Fix some NULL dereferences on error There are some error paths in kvmppc_core_vcpu_create_e500() where we forget to set the error code. It means that we return ERR_PTR(0) which is NULL and it results in a NULL pointer dereference in the caller. 
Signed-off-by: Dan Carpenter Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/e500.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 32fdab57d604..f9f6468f4171 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -455,16 +455,20 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, if (err) goto free_vcpu; - if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) + if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) { + err = -ENOMEM; goto uninit_vcpu; + } err = kvmppc_e500_tlb_init(vcpu_e500); if (err) goto uninit_id; vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO); - if (!vcpu->arch.shared) + if (!vcpu->arch.shared) { + err = -ENOMEM; goto uninit_tlb; + } return vcpu; -- cgit v1.2.3-70-g09d2 From 50a1a25987146c5ab15d6c1642a5043730ace0a5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 13 Jul 2017 10:38:49 +0300 Subject: KVM: PPC: e500mc: Fix a NULL dereference We should set "err = -ENOMEM;", otherwise it means we're returning ERR_PTR(0) which is NULL. It results in a NULL pointer dereference in the caller. 
Signed-off-by: Dan Carpenter Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/e500mc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index f48a0c22e8f9..d0b6b5788afc 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -331,8 +331,10 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, goto uninit_vcpu; vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - if (!vcpu->arch.shared) + if (!vcpu->arch.shared) { + err = -ENOMEM; goto uninit_tlb; + } return vcpu; -- cgit v1.2.3-70-g09d2 From d182b8fd6084412963cdb1a16d04c2f07234e82b Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Mon, 31 Jul 2017 14:39:59 -0700 Subject: KVM: PPC: Book3S HV: Fix setting of storage key in H_ENTER In handling a H_ENTER hypercall, the code in kvmppc_do_h_enter clobbers the high-order two bits of the storage key, which is stored in a split field in the second doubleword of the HPTE. Any storage key number above 7 hence fails to operate correctly. This makes sure we preserve all the bits of the storage key. 
Acked-by: Balbir Singh Signed-off-by: Ram Pai Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 1 + arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 6981a52b3887..5c6b18a71d55 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -104,6 +104,7 @@ #define HPTE_R_C ASM_CONST(0x0000000000000080) #define HPTE_R_R ASM_CONST(0x0000000000000100) #define HPTE_R_KEY_LO ASM_CONST(0x0000000000000e00) +#define HPTE_R_KEY (HPTE_R_KEY_LO | HPTE_R_KEY_HI) #define HPTE_V_1TB_SEG ASM_CONST(0x4000000000000000) #define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 584c74c8119f..61c8248b41c3 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -269,7 +269,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, if (!realmode) local_irq_restore(irq_flags); - ptel &= ~(HPTE_R_PP0 - psize); + ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1); ptel |= pa; if (pa) -- cgit v1.2.3-70-g09d2 From eaac112eac8604917bc6c39680ff67a937626a96 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 13 Aug 2017 11:33:38 +1000 Subject: KVM: PPC: Book3S HV: Fix H_REGISTER_VPA VPA size validation KVM currently validates the size of the VPA registered by the client against sizeof(struct lppaca), however we align (and therefore size) that struct to 1kB to avoid crossing a 4kB boundary in the client. PAPR calls for sizes >= 640 bytes to be accepted. Hard code this with a comment. 
Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 359c79cdf0cc..1182cfd79857 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -485,7 +485,13 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, switch (subfunc) { case H_VPA_REG_VPA: /* register VPA */ - if (len < sizeof(struct lppaca)) + /* + * The size of our lppaca is 1kB because of the way we align + * it for the guest to avoid crossing a 4kB boundary. We only + * use 640 bytes of the structure though, so we should accept + * clients that set a size of 640. + */ + if (len < 640) break; vpap = &tvcpu->arch.vpa; err = 0; -- cgit v1.2.3-70-g09d2 From 0bfa33c7f71e7e9de289582d9652110daa1ec8a1 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 15 Aug 2017 14:37:01 +1000 Subject: KVM: PPC: Book3S HV: Fix invalid use of register expression binutils >= 2.26 now warns about misuse of register expressions in assembler operands that are actually literals. In this instance r0 is being used where a literal 0 should be used. 
Signed-off-by: Andreas Schwab [mpe: Split into separate KVM patch, tweak change log] Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c52184a8efdf..0bc400f882f4 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -976,7 +976,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) #ifdef CONFIG_KVM_XICS /* We are entering the guest on that thread, push VCPU to XIVE */ ld r10, HSTATE_XIVE_TIMA_PHYS(r13) - cmpldi cr0, r10, r0 + cmpldi cr0, r10, 0 beq no_xive ld r11, VCPU_XIVE_SAVED_STATE(r4) li r9, TM_QW1_OS -- cgit v1.2.3-70-g09d2 From a4faf2e77a565432278430deaaba2f3295b6a2df Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 25 Aug 2017 19:52:12 +1000 Subject: KVM: PPC: Book3S HV: Fix case where HDEC is treated as 32-bit on POWER9 Commit 2f2724630f7a ("KVM: PPC: Book3S HV: Cope with host using large decrementer mode", 2017-05-22) added code to treat the hypervisor decrementer (HDEC) as a 64-bit value on POWER9 rather than 32-bit. Unfortunately, that commit missed one place where HDEC is treated as a 32-bit value. This fixes it. This bug should not have any user-visible consequences that I can think of, beyond an occasional unnecessary exit to the host kernel. If the hypervisor decrementer has gone negative, then the bottom 32 bits will be negative for about 4 seconds after that, so as long as we get out of the guest within those 4 seconds we won't conclude that the HDEC interrupt is spurious. 
Reported-by: Suraj Jitindar Singh Fixes: 2f2724630f7a ("KVM: PPC: Book3S HV: Cope with host using large decrementer mode") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 0bc400f882f4..29ec462d11c5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1280,7 +1280,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f mfspr r3,SPRN_HDEC - cmpwi r3,0 + EXTEND_HDEC(r3) + cmpdi r3,0 mr r4,r9 bge fast_guest_return 2: -- cgit v1.2.3-70-g09d2 From e3bfed1df379c18f20feb06427d952b766e2c00f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 25 Aug 2017 19:53:39 +1000 Subject: KVM: PPC: Book3S HV: Report storage key support to userspace This adds information about storage keys to the struct returned by the KVM_PPC_GET_SMMU_INFO ioctl. The new fields replace a pad field, which was zeroed by previous kernel versions. Thus userspace that knows about the new fields will see zeroes when running on an older kernel, indicating that storage keys are not supported. The size of the structure has not changed. The number of keys is hard-coded for the CPUs supported by HV KVM, which is just POWER7, POWER8 and POWER9. Signed-off-by: Paul Mackerras Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 8 ++++++++ include/uapi/linux/kvm.h | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1182cfd79857..f62ad2e9085f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3331,6 +3331,14 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, if (radix_enabled()) return -EINVAL; + /* + * POWER7, POWER8 and POWER9 all support 32 storage keys for data. 
+ * POWER7 doesn't support keys for instruction accesses, + * POWER8 and POWER9 do. + */ + info->data_keys = 32; + info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0; + info->flags = KVM_PPC_PAGE_SIZES_REAL; if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) info->flags |= KVM_PPC_1T_SEGMENTS; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6cd63c18708a..838887587411 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -711,7 +711,8 @@ struct kvm_ppc_one_seg_page_size { struct kvm_ppc_smmu_info { __u64 flags; __u32 slb_size; - __u32 pad; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; -- cgit v1.2.3-70-g09d2 From 43f6b0cfb2e4dfd5aa452a1d44491e805e37b7da Mon Sep 17 00:00:00 2001 From: nixiaoming Date: Thu, 31 Aug 2017 18:51:09 +0800 Subject: KVM: PPC: Book3S HV: Fix memory leak in kvm_vm_ioctl_get_htab_fd We do ctx = kzalloc(sizeof(*ctx), GFP_KERNEL) and then later on call anon_inode_getfd(), but if that fails we don't free ctx, so that memory gets leaked. To fix it, this adds kfree(ctx) in the failure path. Signed-off-by: nixiaoming Reviewed-by: Paolo Bonzini Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 67075e065ef2..7c62967d672c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1941,6 +1941,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); if (ret < 0) { + kfree(ctx); kvm_put_kvm(kvm); return ret; } -- cgit v1.2.3-70-g09d2