author      Jiri Kosina <jkosina@suse.cz>  2021-11-05 12:10:27 +0100
committer   Jiri Kosina <jkosina@suse.cz>  2021-11-05 12:10:27 +0100
commit      820e9906cf64142169134f35b996108303cf22ca (patch)
tree        89f8831fb39c59aba208395d91e25ab4b26f473f /arch
parent      b9865081a56a5cd01cd7c9735911709ff82bd8df (diff)
parent      2ea5999d07d2a0ab6ad92ccf65524707f2c5e456 (diff)
Merge branch 'for-5.16/asus' into for-linus
Diffstat (limited to 'arch')
651 files changed, 8000 insertions, 8633 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 98db63496bab..8df1c7102643 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -197,6 +197,9 @@ config HAVE_FUNCTION_ERROR_INJECTION config HAVE_NMI bool +config TRACE_IRQFLAGS_SUPPORT + bool + # # An arch should select this if it provides all these things: # @@ -886,7 +889,7 @@ config HAVE_SOFTIRQ_ON_OWN_STACK bool help Architecture provides a function to run __do_softirq() on a - seperate stack. + separate stack. config PGTABLE_LEVELS int diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 02e5b673bdad..4e87783c90ad 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -20,7 +20,7 @@ config ALPHA select NEED_SG_DMA_LENGTH select VIRT_TO_BUS select GENERIC_IRQ_PROBE - select GENERIC_PCI_IOMAP if PCI + select GENERIC_PCI_IOMAP select AUTO_IRQ_AFFINITY if SMP select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION @@ -199,7 +199,6 @@ config ALPHA_EIGER config ALPHA_JENSEN bool "Jensen" - depends on BROKEN select HAVE_EISA help DEC PC 150 AXP (aka Jensen): This is a very old Digital system - one diff --git a/arch/alpha/include/asm/agp.h b/arch/alpha/include/asm/agp.h index 7173eada1567..7874f063d000 100644 --- a/arch/alpha/include/asm/agp.h +++ b/arch/alpha/include/asm/agp.h @@ -6,8 +6,8 @@ /* dummy for now */ -#define map_page_into_agp(page) -#define unmap_page_from_agp(page) +#define map_page_into_agp(page) do { } while (0) +#define unmap_page_from_agp(page) do { } while (0) #define flush_agp_cache() mb() /* GATT allocation. Returns/accepts GATT kernel virtual address. */ diff --git a/arch/alpha/include/asm/asm-prototypes.h b/arch/alpha/include/asm/asm-prototypes.h index b34cc1f06720..c8ae46fc2e74 100644 --- a/arch/alpha/include/asm/asm-prototypes.h +++ b/arch/alpha/include/asm/asm-prototypes.h @@ -16,3 +16,4 @@ extern void __divlu(void); extern void __remlu(void); extern void __divqu(void); extern void __remqu(void); +extern unsigned long __udiv_qrnnd(unsigned long *, unsigned long, unsigned long , unsigned long); diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index 0fab5ac90775..c9cb554fbe54 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -60,7 +60,7 @@ extern inline void set_hae(unsigned long new_hae) * Change virtual addresses to physical addresses and vv. */ #ifdef USE_48_BIT_KSEG -static inline unsigned long virt_to_phys(void *address) +static inline unsigned long virt_to_phys(volatile void *address) { return (unsigned long)address - IDENT_ADDR; } @@ -70,7 +70,7 @@ static inline void * phys_to_virt(unsigned long address) return (void *) (address + IDENT_ADDR); } #else -static inline unsigned long virt_to_phys(void *address) +static inline unsigned long virt_to_phys(volatile void *address) { unsigned long phys = (unsigned long)address; @@ -106,7 +106,7 @@ static inline void * phys_to_virt(unsigned long address) extern unsigned long __direct_map_base; extern unsigned long __direct_map_size; -static inline unsigned long __deprecated virt_to_bus(void *address) +static inline unsigned long __deprecated virt_to_bus(volatile void *address) { unsigned long phys = virt_to_phys(address); unsigned long bus = phys + __direct_map_base; diff --git a/arch/alpha/include/asm/jensen.h b/arch/alpha/include/asm/jensen.h index 916895155a88..1c4131453db2 100644 --- a/arch/alpha/include/asm/jensen.h +++ b/arch/alpha/include/asm/jensen.h @@ -111,18 +111,18 @@ __EXTERN_INLINE void jensen_set_hae(unsigned long addr) * convinced that I need one of the newer machines. 
*/ -static inline unsigned int jensen_local_inb(unsigned long addr) +__EXTERN_INLINE unsigned int jensen_local_inb(unsigned long addr) { return 0xff & *(vuip)((addr << 9) + EISA_VL82C106); } -static inline void jensen_local_outb(u8 b, unsigned long addr) +__EXTERN_INLINE void jensen_local_outb(u8 b, unsigned long addr) { *(vuip)((addr << 9) + EISA_VL82C106) = b; mb(); } -static inline unsigned int jensen_bus_inb(unsigned long addr) +__EXTERN_INLINE unsigned int jensen_bus_inb(unsigned long addr) { long result; @@ -131,7 +131,7 @@ static inline unsigned int jensen_bus_inb(unsigned long addr) return __kernel_extbl(result, addr & 3); } -static inline void jensen_bus_outb(u8 b, unsigned long addr) +__EXTERN_INLINE void jensen_bus_outb(u8 b, unsigned long addr) { jensen_set_hae(0); *(vuip)((addr << 7) + EISA_IO + 0x00) = b * 0x01010101; diff --git a/arch/alpha/include/asm/setup.h b/arch/alpha/include/asm/setup.h new file mode 100644 index 000000000000..262aab99e391 --- /dev/null +++ b/arch/alpha/include/asm/setup.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ALPHA_SETUP_H +#define __ALPHA_SETUP_H + +#include <uapi/asm/setup.h> + +/* + * We leave one page for the initial stack page, and one page for + * the initial process structure. Also, the console eats 3 MB for + * the initial bootloader (one of which we can reclaim later). + */ +#define BOOT_PCB 0x20000000 +#define BOOT_ADDR 0x20000000 +/* Remove when official MILO sources have ELF support: */ +#define BOOT_SIZE (16*1024) + +#ifdef CONFIG_ALPHA_LEGACY_START_ADDRESS +#define KERNEL_START_PHYS 0x300000 /* Old bootloaders hardcoded this. */ +#else +#define KERNEL_START_PHYS 0x1000000 /* required: Wildfire/Titan/Marvel */ +#endif + +#define KERNEL_START (PAGE_OFFSET+KERNEL_START_PHYS) +#define SWAPPER_PGD KERNEL_START +#define INIT_STACK (PAGE_OFFSET+KERNEL_START_PHYS+0x02000) +#define EMPTY_PGT (PAGE_OFFSET+KERNEL_START_PHYS+0x04000) +#define EMPTY_PGE (PAGE_OFFSET+KERNEL_START_PHYS+0x08000) +#define ZERO_PGE (PAGE_OFFSET+KERNEL_START_PHYS+0x0A000) + +#define START_ADDR (PAGE_OFFSET+KERNEL_START_PHYS+0x10000) + +/* + * This is setup by the secondary bootstrap loader. Because + * the zero page is zeroed out as soon as the vm system is + * initialized, we need to copy things out into a more permanent + * place. + */ +#define PARAM ZERO_PGE +#define COMMAND_LINE ((char *)(absolute_pointer(PARAM + 0x0000))) +#define INITRD_START (*(unsigned long *) (PARAM+0x100)) +#define INITRD_SIZE (*(unsigned long *) (PARAM+0x108)) + +#endif diff --git a/arch/alpha/include/uapi/asm/setup.h b/arch/alpha/include/uapi/asm/setup.h index 13b7ee465b0e..f881ea5947cb 100644 --- a/arch/alpha/include/uapi/asm/setup.h +++ b/arch/alpha/include/uapi/asm/setup.h @@ -1,43 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __ALPHA_SETUP_H -#define __ALPHA_SETUP_H +#ifndef _UAPI__ALPHA_SETUP_H +#define _UAPI__ALPHA_SETUP_H #define COMMAND_LINE_SIZE 256 -/* - * We leave one page for the initial stack page, and one page for - * the initial process structure. Also, the console eats 3 MB for - * the initial bootloader (one of which we can reclaim later). - */ -#define BOOT_PCB 0x20000000 -#define BOOT_ADDR 0x20000000 -/* Remove when official MILO sources have ELF support: */ -#define BOOT_SIZE (16*1024) - -#ifdef CONFIG_ALPHA_LEGACY_START_ADDRESS -#define KERNEL_START_PHYS 0x300000 /* Old bootloaders hardcoded this. 
*/ -#else -#define KERNEL_START_PHYS 0x1000000 /* required: Wildfire/Titan/Marvel */ -#endif - -#define KERNEL_START (PAGE_OFFSET+KERNEL_START_PHYS) -#define SWAPPER_PGD KERNEL_START -#define INIT_STACK (PAGE_OFFSET+KERNEL_START_PHYS+0x02000) -#define EMPTY_PGT (PAGE_OFFSET+KERNEL_START_PHYS+0x04000) -#define EMPTY_PGE (PAGE_OFFSET+KERNEL_START_PHYS+0x08000) -#define ZERO_PGE (PAGE_OFFSET+KERNEL_START_PHYS+0x0A000) - -#define START_ADDR (PAGE_OFFSET+KERNEL_START_PHYS+0x10000) - -/* - * This is setup by the secondary bootstrap loader. Because - * the zero page is zeroed out as soon as the vm system is - * initialized, we need to copy things out into a more permanent - * place. - */ -#define PARAM ZERO_PGE -#define COMMAND_LINE ((char*)(PARAM + 0x0000)) -#define INITRD_START (*(unsigned long *) (PARAM+0x100)) -#define INITRD_SIZE (*(unsigned long *) (PARAM+0x108)) - -#endif +#endif /* _UAPI__ALPHA_SETUP_H */ diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index 0021580d79ad..5808a66e2a81 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -60,6 +60,8 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num, * @sparse: address space type * * Use the bus mapping routines to map a PCI resource into userspace. + * + * Return: %0 on success, negative error code otherwise */ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, @@ -106,7 +108,7 @@ static int pci_mmap_resource_dense(struct file *filp, struct kobject *kobj, /** * pci_remove_resource_files - cleanup resource files - * @dev: dev to cleanup + * @pdev: pci_dev to cleanup * * If we created resource files for @dev, remove them from sysfs and * free their resources. @@ -221,10 +223,12 @@ static int pci_create_attr(struct pci_dev *pdev, int num) } /** - * pci_create_resource_files - create resource files in sysfs for @dev - * @dev: dev in question + * pci_create_resource_files - create resource files in sysfs for @pdev + * @pdev: pci_dev in question * * Walk the resources in @dev creating files for each resource available. + * + * Return: %0 on success, or negative error code */ int pci_create_resource_files(struct pci_dev *pdev) { @@ -296,7 +300,7 @@ int pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma, /** * pci_adjust_legacy_attr - adjustment of legacy file attributes - * @b: bus to create files under + * @bus: bus to create files under * @mmap_type: I/O port or memory * * Adjust file name and size for sparse mappings. diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index e5d870ff225f..5c9c88428124 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -7,6 +7,11 @@ * * Code supporting the Jensen. 
*/ +#define __EXTERN_INLINE +#include <asm/io.h> +#include <asm/jensen.h> +#undef __EXTERN_INLINE + #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> @@ -17,11 +22,6 @@ #include <asm/ptrace.h> -#define __EXTERN_INLINE inline -#include <asm/io.h> -#include <asm/jensen.h> -#undef __EXTERN_INLINE - #include <asm/dma.h> #include <asm/irq.h> #include <asm/mmu_context.h> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 7ac22e007d52..e4a041cd5715 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -486,3 +486,5 @@ 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self +# 557 reserved for memfd_secret +558 common process_mrelease sys_process_mrelease diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile index 854d5e79979e..1cc74f7b50ef 100644 --- a/arch/alpha/lib/Makefile +++ b/arch/alpha/lib/Makefile @@ -14,6 +14,7 @@ ev6-$(CONFIG_ALPHA_EV6) := ev6- ev67-$(CONFIG_ALPHA_EV67) := ev67- lib-y = __divqu.o __remqu.o __divlu.o __remlu.o \ + udiv-qrnnd.o \ udelay.o \ $(ev6-y)memset.o \ $(ev6-y)memcpy.o \ diff --git a/arch/alpha/math-emu/qrnnd.S b/arch/alpha/lib/udiv-qrnnd.S index d6373ec1bff9..b887aa5428e5 100644 --- a/arch/alpha/math-emu/qrnnd.S +++ b/arch/alpha/lib/udiv-qrnnd.S @@ -25,6 +25,7 @@ # along with GCC; see the file COPYING. If not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. +#include <asm/export.h> .set noreorder .set noat @@ -161,3 +162,4 @@ $Odd: ret $31,($26),1 .end __udiv_qrnnd +EXPORT_SYMBOL(__udiv_qrnnd) diff --git a/arch/alpha/math-emu/Makefile b/arch/alpha/math-emu/Makefile index 6eda0973e183..3206402a8792 100644 --- a/arch/alpha/math-emu/Makefile +++ b/arch/alpha/math-emu/Makefile @@ -7,4 +7,4 @@ ccflags-y := -w obj-$(CONFIG_MATHEMU) += math-emu.o -math-emu-objs := math.o qrnnd.o +math-emu-objs := math.o diff --git a/arch/alpha/math-emu/math.c b/arch/alpha/math-emu/math.c index f7cef66af88d..4212258f3cfd 100644 --- a/arch/alpha/math-emu/math.c +++ b/arch/alpha/math-emu/math.c @@ -403,5 +403,3 @@ alpha_fp_emul_imprecise (struct pt_regs *regs, unsigned long write_mask) egress: return si_code; } - -EXPORT_SYMBOL(__udiv_qrnnd); diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index b5bf68e74732..3a5a80f302e1 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -49,9 +49,7 @@ config ARC select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 select SET_FS - -config TRACE_IRQFLAGS_SUPPORT - def_bool y + select TRACE_IRQFLAGS_SUPPORT config LOCKDEP_SUPPORT def_bool y @@ -116,16 +114,9 @@ choice default ARC_CPU_770 if ISA_ARCOMPACT default ARC_CPU_HS if ISA_ARCV2 -if ISA_ARCOMPACT - -config ARC_CPU_750D - bool "ARC750D" - select ARC_CANT_LLSC - help - Support for ARC750 core - config ARC_CPU_770 bool "ARC770" + depends on ISA_ARCOMPACT select ARC_HAS_SWAPE help Support for ARC770 core introduced with Rel 4.10 (Summer 2011) @@ -135,8 +126,6 @@ config ARC_CPU_770 -Caches: New Prog Model, Region Flush -Insns: endian swap, load-locked/store-conditional, time-stamp-ctr -endif #ISA_ARCOMPACT - config ARC_CPU_HS bool "ARC-HS" depends on ISA_ARCV2 @@ -274,33 +263,17 @@ config ARC_DCCM_BASE choice prompt "MMU Version" - default ARC_MMU_V3 if ARC_CPU_770 - default ARC_MMU_V2 if ARC_CPU_750D - default ARC_MMU_V4 if ARC_CPU_HS - 
-if ISA_ARCOMPACT - -config ARC_MMU_V1 - bool "MMU v1" - help - Orig ARC700 MMU - -config ARC_MMU_V2 - bool "MMU v2" - help - Fixed the deficiency of v1 - possible thrashing in memcpy scenario - when 2 D-TLB and 1 I-TLB entries index into same 2way set. + default ARC_MMU_V3 if ISA_ARCOMPACT + default ARC_MMU_V4 if ISA_ARCV2 config ARC_MMU_V3 bool "MMU v3" - depends on ARC_CPU_770 + depends on ISA_ARCOMPACT help Introduced with ARC700 4.10: New Features Variable Page size (1k-16k), var JTLB size 128 x (2 or 4) Shared Address Spaces (SASID) -endif - config ARC_MMU_V4 bool "MMU v4" depends on ISA_ARCV2 @@ -319,7 +292,6 @@ config ARC_PAGE_SIZE_8K config ARC_PAGE_SIZE_16K bool "16KB" - depends on ARC_MMU_V3 || ARC_MMU_V4 config ARC_PAGE_SIZE_4K bool "4KB" @@ -340,6 +312,10 @@ config ARC_HUGEPAGE_16M endchoice +config PGTABLE_LEVELS + int "Number of Page table levels" + default 2 + config ARC_COMPACT_IRQ_LEVELS depends on ISA_ARCOMPACT bool "Setup Timer IRQ as high Priority" @@ -563,9 +539,6 @@ config ARC_DW2_UNWIND If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame unwind information -config ARC_DBG_TLB_PARANOIA - bool "Paranoia Checks in Low Level TLB Handlers" - config ARC_DBG_JUMP_LABEL bool "Paranoid checks in Static Keys (jump labels) code" depends on JUMP_LABEL diff --git a/arch/arc/Makefile b/arch/arc/Makefile index c0d87ac2e221..8782a03f24a8 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -18,8 +18,7 @@ ifeq ($(CONFIG_ARC_TUNE_MCPU),"") cflags-y += $(tune-mcpu-def-y) else tune-mcpu := $(shell echo $(CONFIG_ARC_TUNE_MCPU)) -tune-mcpu-ok := $(call cc-option-yn, $(tune-mcpu)) -ifeq ($(tune-mcpu-ok),y) +ifneq ($(call cc-option,$(tune-mcpu)),) cflags-y += $(tune-mcpu) else # The flag provided by 'CONFIG_ARC_TUNE_MCPU' option isn't known by this compiler diff --git a/arch/arc/include/asm/atomic-llsc.h b/arch/arc/include/asm/atomic-llsc.h new file mode 100644 index 000000000000..088d348781c1 --- /dev/null +++ b/arch/arc/include/asm/atomic-llsc.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _ASM_ARC_ATOMIC_LLSC_H +#define _ASM_ARC_ATOMIC_LLSC_H + +#define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) + +#define ATOMIC_OP(op, c_op, asm_op) \ +static inline void arch_atomic_##op(int i, atomic_t *v) \ +{ \ + unsigned int val; \ + \ + __asm__ __volatile__( \ + "1: llock %[val], [%[ctr]] \n" \ + " " #asm_op " %[val], %[val], %[i] \n" \ + " scond %[val], [%[ctr]] \n" \ + " bnz 1b \n" \ + : [val] "=&r" (val) /* Early clobber to prevent reg reuse */ \ + : [ctr] "r" (&v->counter), /* Not "m": llock only supports reg direct addr mode */ \ + [i] "ir" (i) \ + : "cc"); \ +} \ + +#define ATOMIC_OP_RETURN(op, c_op, asm_op) \ +static inline int arch_atomic_##op##_return_relaxed(int i, atomic_t *v) \ +{ \ + unsigned int val; \ + \ + __asm__ __volatile__( \ + "1: llock %[val], [%[ctr]] \n" \ + " " #asm_op " %[val], %[val], %[i] \n" \ + " scond %[val], [%[ctr]] \n" \ + " bnz 1b \n" \ + : [val] "=&r" (val) \ + : [ctr] "r" (&v->counter), \ + [i] "ir" (i) \ + : "cc"); \ + \ + return val; \ +} + +#define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed +#define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed + +#define ATOMIC_FETCH_OP(op, c_op, asm_op) \ +static inline int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v) \ +{ \ + unsigned int val, orig; \ + \ + __asm__ __volatile__( \ + "1: llock %[orig], [%[ctr]] \n" \ + " " #asm_op " %[val], %[orig], %[i] \n" \ + " scond %[val], [%[ctr]] 
\n" \ + " bnz 1b \n" \ + : [val] "=&r" (val), \ + [orig] "=&r" (orig) \ + : [ctr] "r" (&v->counter), \ + [i] "ir" (i) \ + : "cc"); \ + \ + return orig; \ +} + +#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed +#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed + +#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed +#define arch_atomic_fetch_andnot_relaxed arch_atomic_fetch_andnot_relaxed +#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed +#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed + +#define ATOMIC_OPS(op, c_op, asm_op) \ + ATOMIC_OP(op, c_op, asm_op) \ + ATOMIC_OP_RETURN(op, c_op, asm_op) \ + ATOMIC_FETCH_OP(op, c_op, asm_op) + +ATOMIC_OPS(add, +=, add) +ATOMIC_OPS(sub, -=, sub) + +#undef ATOMIC_OPS +#define ATOMIC_OPS(op, c_op, asm_op) \ + ATOMIC_OP(op, c_op, asm_op) \ + ATOMIC_FETCH_OP(op, c_op, asm_op) + +ATOMIC_OPS(and, &=, and) +ATOMIC_OPS(andnot, &= ~, bic) +ATOMIC_OPS(or, |=, or) +ATOMIC_OPS(xor, ^=, xor) + +#define arch_atomic_andnot arch_atomic_andnot + +#undef ATOMIC_OPS +#undef ATOMIC_FETCH_OP +#undef ATOMIC_OP_RETURN +#undef ATOMIC_OP + +#endif diff --git a/arch/arc/include/asm/atomic-spinlock.h b/arch/arc/include/asm/atomic-spinlock.h new file mode 100644 index 000000000000..2c830347bfb4 --- /dev/null +++ b/arch/arc/include/asm/atomic-spinlock.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _ASM_ARC_ATOMIC_SPLOCK_H +#define _ASM_ARC_ATOMIC_SPLOCK_H + +/* + * Non hardware assisted Atomic-R-M-W + * Locking would change to irq-disabling only (UP) and spinlocks (SMP) + */ + +static inline void arch_atomic_set(atomic_t *v, int i) +{ + /* + * Independent of hardware support, all of the atomic_xxx() APIs need + * to follow the same locking rules to make sure that a "hardware" + * atomic insn (e.g. LD) doesn't clobber an "emulated" atomic insn + * sequence + * + * Thus atomic_set() despite being 1 insn (and seemingly atomic) + * requires the locking. 
+ */ + unsigned long flags; + + atomic_ops_lock(flags); + WRITE_ONCE(v->counter, i); + atomic_ops_unlock(flags); +} + +#define arch_atomic_set_release(v, i) arch_atomic_set((v), (i)) + +#define ATOMIC_OP(op, c_op, asm_op) \ +static inline void arch_atomic_##op(int i, atomic_t *v) \ +{ \ + unsigned long flags; \ + \ + atomic_ops_lock(flags); \ + v->counter c_op i; \ + atomic_ops_unlock(flags); \ +} + +#define ATOMIC_OP_RETURN(op, c_op, asm_op) \ +static inline int arch_atomic_##op##_return(int i, atomic_t *v) \ +{ \ + unsigned long flags; \ + unsigned int temp; \ + \ + /* \ + * spin lock/unlock provides the needed smp_mb() before/after \ + */ \ + atomic_ops_lock(flags); \ + temp = v->counter; \ + temp c_op i; \ + v->counter = temp; \ + atomic_ops_unlock(flags); \ + \ + return temp; \ +} + +#define ATOMIC_FETCH_OP(op, c_op, asm_op) \ +static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ +{ \ + unsigned long flags; \ + unsigned int orig; \ + \ + /* \ + * spin lock/unlock provides the needed smp_mb() before/after \ + */ \ + atomic_ops_lock(flags); \ + orig = v->counter; \ + v->counter c_op i; \ + atomic_ops_unlock(flags); \ + \ + return orig; \ +} + +#define ATOMIC_OPS(op, c_op, asm_op) \ + ATOMIC_OP(op, c_op, asm_op) \ + ATOMIC_OP_RETURN(op, c_op, asm_op) \ + ATOMIC_FETCH_OP(op, c_op, asm_op) + +ATOMIC_OPS(add, +=, add) +ATOMIC_OPS(sub, -=, sub) + +#undef ATOMIC_OPS +#define ATOMIC_OPS(op, c_op, asm_op) \ + ATOMIC_OP(op, c_op, asm_op) \ + ATOMIC_FETCH_OP(op, c_op, asm_op) + +ATOMIC_OPS(and, &=, and) +ATOMIC_OPS(andnot, &= ~, bic) +ATOMIC_OPS(or, |=, or) +ATOMIC_OPS(xor, ^=, xor) + +#define arch_atomic_andnot arch_atomic_andnot +#define arch_atomic_fetch_andnot arch_atomic_fetch_andnot + +#undef ATOMIC_OPS +#undef ATOMIC_FETCH_OP +#undef ATOMIC_OP_RETURN +#undef ATOMIC_OP + +#endif diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 7a36d79b5b2f..52ee51e1ff7c 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -17,435 +17,43 @@ #define arch_atomic_read(v) READ_ONCE((v)->counter) #ifdef CONFIG_ARC_HAS_LLSC - -#define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) - -#define ATOMIC_OP(op, c_op, asm_op) \ -static inline void arch_atomic_##op(int i, atomic_t *v) \ -{ \ - unsigned int val; \ - \ - __asm__ __volatile__( \ - "1: llock %[val], [%[ctr]] \n" \ - " " #asm_op " %[val], %[val], %[i] \n" \ - " scond %[val], [%[ctr]] \n" \ - " bnz 1b \n" \ - : [val] "=&r" (val) /* Early clobber to prevent reg reuse */ \ - : [ctr] "r" (&v->counter), /* Not "m": llock only supports reg direct addr mode */ \ - [i] "ir" (i) \ - : "cc"); \ -} \ - -#define ATOMIC_OP_RETURN(op, c_op, asm_op) \ -static inline int arch_atomic_##op##_return(int i, atomic_t *v) \ -{ \ - unsigned int val; \ - \ - /* \ - * Explicit full memory barrier needed before/after as \ - * LLOCK/SCOND themselves don't provide any such semantics \ - */ \ - smp_mb(); \ - \ - __asm__ __volatile__( \ - "1: llock %[val], [%[ctr]] \n" \ - " " #asm_op " %[val], %[val], %[i] \n" \ - " scond %[val], [%[ctr]] \n" \ - " bnz 1b \n" \ - : [val] "=&r" (val) \ - : [ctr] "r" (&v->counter), \ - [i] "ir" (i) \ - : "cc"); \ - \ - smp_mb(); \ - \ - return val; \ -} - -#define ATOMIC_FETCH_OP(op, c_op, asm_op) \ -static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - unsigned int val, orig; \ - \ - /* \ - * Explicit full memory barrier needed before/after as \ - * LLOCK/SCOND themselves don't provide any such semantics \ - */ \ - smp_mb(); \ - \ - __asm__ __volatile__( \ - "1: 
llock %[orig], [%[ctr]] \n" \ - " " #asm_op " %[val], %[orig], %[i] \n" \ - " scond %[val], [%[ctr]] \n" \ - " bnz 1b \n" \ - : [val] "=&r" (val), \ - [orig] "=&r" (orig) \ - : [ctr] "r" (&v->counter), \ - [i] "ir" (i) \ - : "cc"); \ - \ - smp_mb(); \ - \ - return orig; \ -} - -#else /* !CONFIG_ARC_HAS_LLSC */ - -#ifndef CONFIG_SMP - - /* violating atomic_xxx API locking protocol in UP for optimization sake */ -#define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) - +#include <asm/atomic-llsc.h> #else +#include <asm/atomic-spinlock.h> +#endif -static inline void arch_atomic_set(atomic_t *v, int i) -{ - /* - * Independent of hardware support, all of the atomic_xxx() APIs need - * to follow the same locking rules to make sure that a "hardware" - * atomic insn (e.g. LD) doesn't clobber an "emulated" atomic insn - * sequence - * - * Thus atomic_set() despite being 1 insn (and seemingly atomic) - * requires the locking. - */ - unsigned long flags; +#define arch_atomic_cmpxchg(v, o, n) \ +({ \ + arch_cmpxchg(&((v)->counter), (o), (n)); \ +}) - atomic_ops_lock(flags); - WRITE_ONCE(v->counter, i); - atomic_ops_unlock(flags); -} +#ifdef arch_cmpxchg_relaxed +#define arch_atomic_cmpxchg_relaxed(v, o, n) \ +({ \ + arch_cmpxchg_relaxed(&((v)->counter), (o), (n)); \ +}) +#endif -#define arch_atomic_set_release(v, i) arch_atomic_set((v), (i)) +#define arch_atomic_xchg(v, n) \ +({ \ + arch_xchg(&((v)->counter), (n)); \ +}) +#ifdef arch_xchg_relaxed +#define arch_atomic_xchg_relaxed(v, n) \ +({ \ + arch_xchg_relaxed(&((v)->counter), (n)); \ +}) #endif /* - * Non hardware assisted Atomic-R-M-W - * Locking would change to irq-disabling only (UP) and spinlocks (SMP) + * 64-bit atomics */ - -#define ATOMIC_OP(op, c_op, asm_op) \ -static inline void arch_atomic_##op(int i, atomic_t *v) \ -{ \ - unsigned long flags; \ - \ - atomic_ops_lock(flags); \ - v->counter c_op i; \ - atomic_ops_unlock(flags); \ -} - -#define ATOMIC_OP_RETURN(op, c_op, asm_op) \ -static inline int arch_atomic_##op##_return(int i, atomic_t *v) \ -{ \ - unsigned long flags; \ - unsigned long temp; \ - \ - /* \ - * spin lock/unlock provides the needed smp_mb() before/after \ - */ \ - atomic_ops_lock(flags); \ - temp = v->counter; \ - temp c_op i; \ - v->counter = temp; \ - atomic_ops_unlock(flags); \ - \ - return temp; \ -} - -#define ATOMIC_FETCH_OP(op, c_op, asm_op) \ -static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - unsigned long flags; \ - unsigned long orig; \ - \ - /* \ - * spin lock/unlock provides the needed smp_mb() before/after \ - */ \ - atomic_ops_lock(flags); \ - orig = v->counter; \ - v->counter c_op i; \ - atomic_ops_unlock(flags); \ - \ - return orig; \ -} - -#endif /* !CONFIG_ARC_HAS_LLSC */ - -#define ATOMIC_OPS(op, c_op, asm_op) \ - ATOMIC_OP(op, c_op, asm_op) \ - ATOMIC_OP_RETURN(op, c_op, asm_op) \ - ATOMIC_FETCH_OP(op, c_op, asm_op) - -ATOMIC_OPS(add, +=, add) -ATOMIC_OPS(sub, -=, sub) - -#undef ATOMIC_OPS -#define ATOMIC_OPS(op, c_op, asm_op) \ - ATOMIC_OP(op, c_op, asm_op) \ - ATOMIC_FETCH_OP(op, c_op, asm_op) - -ATOMIC_OPS(and, &=, and) -ATOMIC_OPS(andnot, &= ~, bic) -ATOMIC_OPS(or, |=, or) -ATOMIC_OPS(xor, ^=, xor) - -#define arch_atomic_andnot arch_atomic_andnot -#define arch_atomic_fetch_andnot arch_atomic_fetch_andnot - -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP_RETURN -#undef ATOMIC_OP - #ifdef CONFIG_GENERIC_ATOMIC64 - #include <asm-generic/atomic64.h> - -#else /* Kconfig ensures this is only enabled with needed h/w assist */ - -/* - * ARCv2 supports 64-bit 
exclusive load (LLOCKD) / store (SCONDD) - * - The address HAS to be 64-bit aligned - * - There are 2 semantics involved here: - * = exclusive implies no interim update between load/store to same addr - * = both words are observed/updated together: this is guaranteed even - * for regular 64-bit load (LDD) / store (STD). Thus atomic64_set() - * is NOT required to use LLOCKD+SCONDD, STD suffices - */ - -typedef struct { - s64 __aligned(8) counter; -} atomic64_t; - -#define ATOMIC64_INIT(a) { (a) } - -static inline s64 arch_atomic64_read(const atomic64_t *v) -{ - s64 val; - - __asm__ __volatile__( - " ldd %0, [%1] \n" - : "=r"(val) - : "r"(&v->counter)); - - return val; -} - -static inline void arch_atomic64_set(atomic64_t *v, s64 a) -{ - /* - * This could have been a simple assignment in "C" but would need - * explicit volatile. Otherwise gcc optimizers could elide the store - * which borked atomic64 self-test - * In the inline asm version, memory clobber needed for exact same - * reason, to tell gcc about the store. - * - * This however is not needed for sibling atomic64_add() etc since both - * load/store are explicitly done in inline asm. As long as API is used - * for each access, gcc has no way to optimize away any load/store - */ - __asm__ __volatile__( - " std %0, [%1] \n" - : - : "r"(a), "r"(&v->counter) - : "memory"); -} - -#define ATOMIC64_OP(op, op1, op2) \ -static inline void arch_atomic64_##op(s64 a, atomic64_t *v) \ -{ \ - s64 val; \ - \ - __asm__ __volatile__( \ - "1: \n" \ - " llockd %0, [%1] \n" \ - " " #op1 " %L0, %L0, %L2 \n" \ - " " #op2 " %H0, %H0, %H2 \n" \ - " scondd %0, [%1] \n" \ - " bnz 1b \n" \ - : "=&r"(val) \ - : "r"(&v->counter), "ir"(a) \ - : "cc"); \ -} \ - -#define ATOMIC64_OP_RETURN(op, op1, op2) \ -static inline s64 arch_atomic64_##op##_return(s64 a, atomic64_t *v) \ -{ \ - s64 val; \ - \ - smp_mb(); \ - \ - __asm__ __volatile__( \ - "1: \n" \ - " llockd %0, [%1] \n" \ - " " #op1 " %L0, %L0, %L2 \n" \ - " " #op2 " %H0, %H0, %H2 \n" \ - " scondd %0, [%1] \n" \ - " bnz 1b \n" \ - : [val] "=&r"(val) \ - : "r"(&v->counter), "ir"(a) \ - : "cc"); /* memory clobber comes from smp_mb() */ \ - \ - smp_mb(); \ - \ - return val; \ -} - -#define ATOMIC64_FETCH_OP(op, op1, op2) \ -static inline s64 arch_atomic64_fetch_##op(s64 a, atomic64_t *v) \ -{ \ - s64 val, orig; \ - \ - smp_mb(); \ - \ - __asm__ __volatile__( \ - "1: \n" \ - " llockd %0, [%2] \n" \ - " " #op1 " %L1, %L0, %L3 \n" \ - " " #op2 " %H1, %H0, %H3 \n" \ - " scondd %1, [%2] \n" \ - " bnz 1b \n" \ - : "=&r"(orig), "=&r"(val) \ - : "r"(&v->counter), "ir"(a) \ - : "cc"); /* memory clobber comes from smp_mb() */ \ - \ - smp_mb(); \ - \ - return orig; \ -} - -#define ATOMIC64_OPS(op, op1, op2) \ - ATOMIC64_OP(op, op1, op2) \ - ATOMIC64_OP_RETURN(op, op1, op2) \ - ATOMIC64_FETCH_OP(op, op1, op2) - -ATOMIC64_OPS(add, add.f, adc) -ATOMIC64_OPS(sub, sub.f, sbc) -ATOMIC64_OPS(and, and, and) -ATOMIC64_OPS(andnot, bic, bic) -ATOMIC64_OPS(or, or, or) -ATOMIC64_OPS(xor, xor, xor) - -#define arch_atomic64_andnot arch_atomic64_andnot -#define arch_atomic64_fetch_andnot arch_atomic64_fetch_andnot - -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP_RETURN -#undef ATOMIC64_OP - -static inline s64 -arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new) -{ - s64 prev; - - smp_mb(); - - __asm__ __volatile__( - "1: llockd %0, [%1] \n" - " brne %L0, %L2, 2f \n" - " brne %H0, %H2, 2f \n" - " scondd %3, [%1] \n" - " bnz 1b \n" - "2: \n" - : "=&r"(prev) - : "r"(ptr), "ir"(expected), "r"(new) - : "cc"); /* 
memory clobber comes from smp_mb() */ - - smp_mb(); - - return prev; -} - -static inline s64 arch_atomic64_xchg(atomic64_t *ptr, s64 new) -{ - s64 prev; - - smp_mb(); - - __asm__ __volatile__( - "1: llockd %0, [%1] \n" - " scondd %2, [%1] \n" - " bnz 1b \n" - "2: \n" - : "=&r"(prev) - : "r"(ptr), "r"(new) - : "cc"); /* memory clobber comes from smp_mb() */ - - smp_mb(); - - return prev; -} - -/** - * arch_atomic64_dec_if_positive - decrement by 1 if old value positive - * @v: pointer of type atomic64_t - * - * The function returns the old value of *v minus 1, even if - * the atomic variable, v, was not decremented. - */ - -static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) -{ - s64 val; - - smp_mb(); - - __asm__ __volatile__( - "1: llockd %0, [%1] \n" - " sub.f %L0, %L0, 1 # w0 - 1, set C on borrow\n" - " sub.c %H0, %H0, 1 # if C set, w1 - 1\n" - " brlt %H0, 0, 2f \n" - " scondd %0, [%1] \n" - " bnz 1b \n" - "2: \n" - : "=&r"(val) - : "r"(&v->counter) - : "cc"); /* memory clobber comes from smp_mb() */ - - smp_mb(); - - return val; -} -#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive - -/** - * arch_atomic64_fetch_add_unless - add unless the number is a given value - * @v: pointer of type atomic64_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, if it was not @u. - * Returns the old value of @v - */ -static inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) -{ - s64 old, temp; - - smp_mb(); - - __asm__ __volatile__( - "1: llockd %0, [%2] \n" - " brne %L0, %L4, 2f # continue to add since v != u \n" - " breq.d %H0, %H4, 3f # return since v == u \n" - "2: \n" - " add.f %L1, %L0, %L3 \n" - " adc %H1, %H0, %H3 \n" - " scondd %1, [%2] \n" - " bnz 1b \n" - "3: \n" - : "=&r"(old), "=&r" (temp) - : "r"(&v->counter), "r"(a), "r"(u) - : "cc"); /* memory clobber comes from smp_mb() */ - - smp_mb(); - - return old; -} -#define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless - -#endif /* !CONFIG_GENERIC_ATOMIC64 */ +#else +#include <asm/atomic64-arcv2.h> +#endif #endif /* !__ASSEMBLY__ */ diff --git a/arch/arc/include/asm/atomic64-arcv2.h b/arch/arc/include/asm/atomic64-arcv2.h new file mode 100644 index 000000000000..c5a8010fdc97 --- /dev/null +++ b/arch/arc/include/asm/atomic64-arcv2.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* + * ARCv2 supports 64-bit exclusive load (LLOCKD) / store (SCONDD) + * - The address HAS to be 64-bit aligned + */ + +#ifndef _ASM_ARC_ATOMIC64_ARCV2_H +#define _ASM_ARC_ATOMIC64_ARCV2_H + +typedef struct { + s64 __aligned(8) counter; +} atomic64_t; + +#define ATOMIC64_INIT(a) { (a) } + +static inline s64 arch_atomic64_read(const atomic64_t *v) +{ + s64 val; + + __asm__ __volatile__( + " ldd %0, [%1] \n" + : "=r"(val) + : "r"(&v->counter)); + + return val; +} + +static inline void arch_atomic64_set(atomic64_t *v, s64 a) +{ + /* + * This could have been a simple assignment in "C" but would need + * explicit volatile. Otherwise gcc optimizers could elide the store + * which borked atomic64 self-test + * In the inline asm version, memory clobber needed for exact same + * reason, to tell gcc about the store. + * + * This however is not needed for sibling atomic64_add() etc since both + * load/store are explicitly done in inline asm. 
As long as API is used + * for each access, gcc has no way to optimize away any load/store + */ + __asm__ __volatile__( + " std %0, [%1] \n" + : + : "r"(a), "r"(&v->counter) + : "memory"); +} + +#define ATOMIC64_OP(op, op1, op2) \ +static inline void arch_atomic64_##op(s64 a, atomic64_t *v) \ +{ \ + s64 val; \ + \ + __asm__ __volatile__( \ + "1: \n" \ + " llockd %0, [%1] \n" \ + " " #op1 " %L0, %L0, %L2 \n" \ + " " #op2 " %H0, %H0, %H2 \n" \ + " scondd %0, [%1] \n" \ + " bnz 1b \n" \ + : "=&r"(val) \ + : "r"(&v->counter), "ir"(a) \ + : "cc"); \ +} \ + +#define ATOMIC64_OP_RETURN(op, op1, op2) \ +static inline s64 arch_atomic64_##op##_return_relaxed(s64 a, atomic64_t *v) \ +{ \ + s64 val; \ + \ + __asm__ __volatile__( \ + "1: \n" \ + " llockd %0, [%1] \n" \ + " " #op1 " %L0, %L0, %L2 \n" \ + " " #op2 " %H0, %H0, %H2 \n" \ + " scondd %0, [%1] \n" \ + " bnz 1b \n" \ + : [val] "=&r"(val) \ + : "r"(&v->counter), "ir"(a) \ + : "cc"); /* memory clobber comes from smp_mb() */ \ + \ + return val; \ +} + +#define arch_atomic64_add_return_relaxed arch_atomic64_add_return_relaxed +#define arch_atomic64_sub_return_relaxed arch_atomic64_sub_return_relaxed + +#define ATOMIC64_FETCH_OP(op, op1, op2) \ +static inline s64 arch_atomic64_fetch_##op##_relaxed(s64 a, atomic64_t *v) \ +{ \ + s64 val, orig; \ + \ + __asm__ __volatile__( \ + "1: \n" \ + " llockd %0, [%2] \n" \ + " " #op1 " %L1, %L0, %L3 \n" \ + " " #op2 " %H1, %H0, %H3 \n" \ + " scondd %1, [%2] \n" \ + " bnz 1b \n" \ + : "=&r"(orig), "=&r"(val) \ + : "r"(&v->counter), "ir"(a) \ + : "cc"); /* memory clobber comes from smp_mb() */ \ + \ + return orig; \ +} + +#define arch_atomic64_fetch_add_relaxed arch_atomic64_fetch_add_relaxed +#define arch_atomic64_fetch_sub_relaxed arch_atomic64_fetch_sub_relaxed + +#define arch_atomic64_fetch_and_relaxed arch_atomic64_fetch_and_relaxed +#define arch_atomic64_fetch_andnot_relaxed arch_atomic64_fetch_andnot_relaxed +#define arch_atomic64_fetch_or_relaxed arch_atomic64_fetch_or_relaxed +#define arch_atomic64_fetch_xor_relaxed arch_atomic64_fetch_xor_relaxed + +#define ATOMIC64_OPS(op, op1, op2) \ + ATOMIC64_OP(op, op1, op2) \ + ATOMIC64_OP_RETURN(op, op1, op2) \ + ATOMIC64_FETCH_OP(op, op1, op2) + +ATOMIC64_OPS(add, add.f, adc) +ATOMIC64_OPS(sub, sub.f, sbc) + +#undef ATOMIC64_OPS +#define ATOMIC64_OPS(op, op1, op2) \ + ATOMIC64_OP(op, op1, op2) \ + ATOMIC64_FETCH_OP(op, op1, op2) + +ATOMIC64_OPS(and, and, and) +ATOMIC64_OPS(andnot, bic, bic) +ATOMIC64_OPS(or, or, or) +ATOMIC64_OPS(xor, xor, xor) + +#define arch_atomic64_andnot arch_atomic64_andnot + +#undef ATOMIC64_OPS +#undef ATOMIC64_FETCH_OP +#undef ATOMIC64_OP_RETURN +#undef ATOMIC64_OP + +static inline s64 +arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new) +{ + s64 prev; + + smp_mb(); + + __asm__ __volatile__( + "1: llockd %0, [%1] \n" + " brne %L0, %L2, 2f \n" + " brne %H0, %H2, 2f \n" + " scondd %3, [%1] \n" + " bnz 1b \n" + "2: \n" + : "=&r"(prev) + : "r"(ptr), "ir"(expected), "r"(new) + : "cc"); /* memory clobber comes from smp_mb() */ + + smp_mb(); + + return prev; +} + +static inline s64 arch_atomic64_xchg(atomic64_t *ptr, s64 new) +{ + s64 prev; + + smp_mb(); + + __asm__ __volatile__( + "1: llockd %0, [%1] \n" + " scondd %2, [%1] \n" + " bnz 1b \n" + "2: \n" + : "=&r"(prev) + : "r"(ptr), "r"(new) + : "cc"); /* memory clobber comes from smp_mb() */ + + smp_mb(); + + return prev; +} + +/** + * arch_atomic64_dec_if_positive - decrement by 1 if old value positive + * @v: pointer of type atomic64_t + * + * The function returns the old value 
of *v minus 1, even if + * the atomic variable, v, was not decremented. + */ + +static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) +{ + s64 val; + + smp_mb(); + + __asm__ __volatile__( + "1: llockd %0, [%1] \n" + " sub.f %L0, %L0, 1 # w0 - 1, set C on borrow\n" + " sub.c %H0, %H0, 1 # if C set, w1 - 1\n" + " brlt %H0, 0, 2f \n" + " scondd %0, [%1] \n" + " bnz 1b \n" + "2: \n" + : "=&r"(val) + : "r"(&v->counter) + : "cc"); /* memory clobber comes from smp_mb() */ + + smp_mb(); + + return val; +} +#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive + +/** + * arch_atomic64_fetch_add_unless - add unless the number is a given value + * @v: pointer of type atomic64_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, if it was not @u. + * Returns the old value of @v + */ +static inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) +{ + s64 old, temp; + + smp_mb(); + + __asm__ __volatile__( + "1: llockd %0, [%2] \n" + " brne %L0, %L4, 2f # continue to add since v != u \n" + " breq.d %H0, %H4, 3f # return since v == u \n" + "2: \n" + " add.f %L1, %L0, %L3 \n" + " adc %H1, %H0, %H3 \n" + " scondd %1, [%2] \n" + " bnz 1b \n" + "3: \n" + : "=&r"(old), "=&r" (temp) + : "r"(&v->counter), "r"(a), "r"(u) + : "cc"); /* memory clobber comes from smp_mb() */ + + smp_mb(); + + return old; +} +#define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless + +#endif diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index fb98440c0bd4..a7daaf64ae34 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -14,188 +14,6 @@ #include <linux/types.h> #include <linux/compiler.h> -#include <asm/barrier.h> -#ifndef CONFIG_ARC_HAS_LLSC -#include <asm/smp.h> -#endif - -#ifdef CONFIG_ARC_HAS_LLSC - -/* - * Hardware assisted Atomic-R-M-W - */ - -#define BIT_OP(op, c_op, asm_op) \ -static inline void op##_bit(unsigned long nr, volatile unsigned long *m)\ -{ \ - unsigned int temp; \ - \ - m += nr >> 5; \ - \ - nr &= 0x1f; \ - \ - __asm__ __volatile__( \ - "1: llock %0, [%1] \n" \ - " " #asm_op " %0, %0, %2 \n" \ - " scond %0, [%1] \n" \ - " bnz 1b \n" \ - : "=&r"(temp) /* Early clobber, to prevent reg reuse */ \ - : "r"(m), /* Not "m": llock only supports reg direct addr mode */ \ - "ir"(nr) \ - : "cc"); \ -} - -/* - * Semantically: - * Test the bit - * if clear - * set it and return 0 (old value) - * else - * return 1 (old value). 
- * - * Since ARC lacks a equivalent h/w primitive, the bit is set unconditionally - * and the old value of bit is returned - */ -#define TEST_N_BIT_OP(op, c_op, asm_op) \ -static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\ -{ \ - unsigned long old, temp; \ - \ - m += nr >> 5; \ - \ - nr &= 0x1f; \ - \ - /* \ - * Explicit full memory barrier needed before/after as \ - * LLOCK/SCOND themselves don't provide any such smenatic \ - */ \ - smp_mb(); \ - \ - __asm__ __volatile__( \ - "1: llock %0, [%2] \n" \ - " " #asm_op " %1, %0, %3 \n" \ - " scond %1, [%2] \n" \ - " bnz 1b \n" \ - : "=&r"(old), "=&r"(temp) \ - : "r"(m), "ir"(nr) \ - : "cc"); \ - \ - smp_mb(); \ - \ - return (old & (1 << nr)) != 0; \ -} - -#else /* !CONFIG_ARC_HAS_LLSC */ - -/* - * Non hardware assisted Atomic-R-M-W - * Locking would change to irq-disabling only (UP) and spinlocks (SMP) - * - * There's "significant" micro-optimization in writing our own variants of - * bitops (over generic variants) - * - * (1) The generic APIs have "signed" @nr while we have it "unsigned" - * This avoids extra code to be generated for pointer arithmatic, since - * is "not sure" that index is NOT -ve - * (2) Utilize the fact that ARCompact bit fidding insn (BSET/BCLR/ASL) etc - * only consider bottom 5 bits of @nr, so NO need to mask them off. - * (GCC Quirk: however for constant @nr we still need to do the masking - * at compile time) - */ - -#define BIT_OP(op, c_op, asm_op) \ -static inline void op##_bit(unsigned long nr, volatile unsigned long *m)\ -{ \ - unsigned long temp, flags; \ - m += nr >> 5; \ - \ - /* \ - * spin lock/unlock provide the needed smp_mb() before/after \ - */ \ - bitops_lock(flags); \ - \ - temp = *m; \ - *m = temp c_op (1UL << (nr & 0x1f)); \ - \ - bitops_unlock(flags); \ -} - -#define TEST_N_BIT_OP(op, c_op, asm_op) \ -static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\ -{ \ - unsigned long old, flags; \ - m += nr >> 5; \ - \ - bitops_lock(flags); \ - \ - old = *m; \ - *m = old c_op (1UL << (nr & 0x1f)); \ - \ - bitops_unlock(flags); \ - \ - return (old & (1UL << (nr & 0x1f))) != 0; \ -} - -#endif - -/*************************************** - * Non atomic variants - **************************************/ - -#define __BIT_OP(op, c_op, asm_op) \ -static inline void __##op##_bit(unsigned long nr, volatile unsigned long *m) \ -{ \ - unsigned long temp; \ - m += nr >> 5; \ - \ - temp = *m; \ - *m = temp c_op (1UL << (nr & 0x1f)); \ -} - -#define __TEST_N_BIT_OP(op, c_op, asm_op) \ -static inline int __test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\ -{ \ - unsigned long old; \ - m += nr >> 5; \ - \ - old = *m; \ - *m = old c_op (1UL << (nr & 0x1f)); \ - \ - return (old & (1UL << (nr & 0x1f))) != 0; \ -} - -#define BIT_OPS(op, c_op, asm_op) \ - \ - /* set_bit(), clear_bit(), change_bit() */ \ - BIT_OP(op, c_op, asm_op) \ - \ - /* test_and_set_bit(), test_and_clear_bit(), test_and_change_bit() */\ - TEST_N_BIT_OP(op, c_op, asm_op) \ - \ - /* __set_bit(), __clear_bit(), __change_bit() */ \ - __BIT_OP(op, c_op, asm_op) \ - \ - /* __test_and_set_bit(), __test_and_clear_bit(), __test_and_change_bit() */\ - __TEST_N_BIT_OP(op, c_op, asm_op) - -BIT_OPS(set, |, bset) -BIT_OPS(clear, & ~, bclr) -BIT_OPS(change, ^, bxor) - -/* - * This routine doesn't need to be atomic. 
- */ -static inline int -test_bit(unsigned int nr, const volatile unsigned long *addr) -{ - unsigned long mask; - - addr += nr >> 5; - - mask = 1UL << (nr & 0x1f); - - return ((mask & *addr) != 0); -} #ifdef CONFIG_ISA_ARCOMPACT @@ -296,7 +114,7 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long word) * @result: [1-32] * fls(1) = 1, fls(0x80000000) = 32, fls(0) = 0 */ -static inline __attribute__ ((const)) int fls(unsigned long x) +static inline __attribute__ ((const)) int fls(unsigned int x) { int n; @@ -323,7 +141,7 @@ static inline __attribute__ ((const)) int __fls(unsigned long x) * ffs = Find First Set in word (LSB to MSB) * @result: [1-32], 0 if all 0's */ -static inline __attribute__ ((const)) int ffs(unsigned long x) +static inline __attribute__ ((const)) int ffs(unsigned int x) { int n; @@ -368,6 +186,8 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x) #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/lock.h> +#include <asm-generic/bitops/atomic.h> +#include <asm-generic/bitops/non-atomic.h> #include <asm-generic/bitops/find.h> #include <asm-generic/bitops/le.h> diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index d8ece4292388..f0f1fc5d62b6 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -62,10 +62,6 @@ #define ARCH_SLAB_MINALIGN 8 #endif -extern void arc_cache_init(void); -extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len); -extern void read_decode_cache_bcr(void); - extern int ioc_enable; extern unsigned long perip_base, perip_end; diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h index d42917e803e1..c5b544a5fe81 100644 --- a/arch/arc/include/asm/cmpxchg.h +++ b/arch/arc/include/asm/cmpxchg.h @@ -6,6 +6,7 @@ #ifndef __ASM_ARC_CMPXCHG_H #define __ASM_ARC_CMPXCHG_H +#include <linux/build_bug.h> #include <linux/types.h> #include <asm/barrier.h> @@ -13,146 +14,130 @@ #ifdef CONFIG_ARC_HAS_LLSC -static inline unsigned long -__cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new) -{ - unsigned long prev; - - /* - * Explicit full memory barrier needed before/after as - * LLOCK/SCOND themselves don't provide any such semantics - */ - smp_mb(); - - __asm__ __volatile__( - "1: llock %0, [%1] \n" - " brne %0, %2, 2f \n" - " scond %3, [%1] \n" - " bnz 1b \n" - "2: \n" - : "=&r"(prev) /* Early clobber, to prevent reg reuse */ - : "r"(ptr), /* Not "m": llock only supports reg direct addr mode */ - "ir"(expected), - "r"(new) /* can't be "ir". 
scond can't take LIMM for "b" */ - : "cc", "memory"); /* so that gcc knows memory is being written here */ - - smp_mb(); - - return prev; -} - -#else /* !CONFIG_ARC_HAS_LLSC */ - -static inline unsigned long -__cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new) -{ - unsigned long flags; - int prev; - volatile unsigned long *p = ptr; - - /* - * spin lock/unlock provide the needed smp_mb() before/after - */ - atomic_ops_lock(flags); - prev = *p; - if (prev == expected) - *p = new; - atomic_ops_unlock(flags); - return prev; -} - -#endif +/* + * if (*ptr == @old) + * *ptr = @new + */ +#define __cmpxchg(ptr, old, new) \ +({ \ + __typeof__(*(ptr)) _prev; \ + \ + __asm__ __volatile__( \ + "1: llock %0, [%1] \n" \ + " brne %0, %2, 2f \n" \ + " scond %3, [%1] \n" \ + " bnz 1b \n" \ + "2: \n" \ + : "=&r"(_prev) /* Early clobber prevent reg reuse */ \ + : "r"(ptr), /* Not "m": llock only supports reg */ \ + "ir"(old), \ + "r"(new) /* Not "ir": scond can't take LIMM */ \ + : "cc", \ + "memory"); /* gcc knows memory is clobbered */ \ + \ + _prev; \ +}) -#define arch_cmpxchg(ptr, o, n) ({ \ - (typeof(*(ptr)))__cmpxchg((ptr), \ - (unsigned long)(o), \ - (unsigned long)(n)); \ +#define arch_cmpxchg_relaxed(ptr, old, new) \ +({ \ + __typeof__(ptr) _p_ = (ptr); \ + __typeof__(*(ptr)) _o_ = (old); \ + __typeof__(*(ptr)) _n_ = (new); \ + __typeof__(*(ptr)) _prev_; \ + \ + switch(sizeof((_p_))) { \ + case 4: \ + _prev_ = __cmpxchg(_p_, _o_, _n_); \ + break; \ + default: \ + BUILD_BUG(); \ + } \ + _prev_; \ }) -/* - * atomic_cmpxchg is same as cmpxchg - * LLSC: only different in data-type, semantics are exactly same - * !LLSC: cmpxchg() has to use an external lock atomic_ops_lock to guarantee - * semantics, and this lock also happens to be used by atomic_*() - */ -#define arch_atomic_cmpxchg(v, o, n) ((int)arch_cmpxchg(&((v)->counter), (o), (n))) +#else +#define arch_cmpxchg(ptr, old, new) \ +({ \ + volatile __typeof__(ptr) _p_ = (ptr); \ + __typeof__(*(ptr)) _o_ = (old); \ + __typeof__(*(ptr)) _n_ = (new); \ + __typeof__(*(ptr)) _prev_; \ + unsigned long __flags; \ + \ + BUILD_BUG_ON(sizeof(_p_) != 4); \ + \ + /* \ + * spin lock/unlock provide the needed smp_mb() before/after \ + */ \ + atomic_ops_lock(__flags); \ + _prev_ = *_p_; \ + if (_prev_ == _o_) \ + *_p_ = _n_; \ + atomic_ops_unlock(__flags); \ + _prev_; \ +}) + +#endif /* - * xchg (reg with memory) based on "Native atomic" EX insn + * xchg */ -static inline unsigned long __xchg(unsigned long val, volatile void *ptr, - int size) -{ - extern unsigned long __xchg_bad_pointer(void); - - switch (size) { - case 4: - smp_mb(); - - __asm__ __volatile__( - " ex %0, [%1] \n" - : "+r"(val) - : "r"(ptr) - : "memory"); +#ifdef CONFIG_ARC_HAS_LLSC - smp_mb(); +#define __xchg(ptr, val) \ +({ \ + __asm__ __volatile__( \ + " ex %0, [%1] \n" /* set new value */ \ + : "+r"(val) \ + : "r"(ptr) \ + : "memory"); \ + _val_; /* get old value */ \ +}) - return val; - } - return __xchg_bad_pointer(); -} +#define arch_xchg_relaxed(ptr, val) \ +({ \ + __typeof__(ptr) _p_ = (ptr); \ + __typeof__(*(ptr)) _val_ = (val); \ + \ + switch(sizeof(*(_p_))) { \ + case 4: \ + _val_ = __xchg(_p_, _val_); \ + break; \ + default: \ + BUILD_BUG(); \ + } \ + _val_; \ +}) -#define _xchg(ptr, with) ((typeof(*(ptr)))__xchg((unsigned long)(with), (ptr), \ - sizeof(*(ptr)))) +#else /* !CONFIG_ARC_HAS_LLSC */ /* - * xchg() maps directly to ARC EX instruction which guarantees atomicity. 
- * However in !LLSC config, it also needs to be use @atomic_ops_lock spinlock - * due to a subtle reason: - * - For !LLSC, cmpxchg() needs to use that lock (see above) and there is lot - * of kernel code which calls xchg()/cmpxchg() on same data (see llist.h) - * Hence xchg() needs to follow same locking rules. - * - * Technically the lock is also needed for UP (boils down to irq save/restore) - * but we can cheat a bit since cmpxchg() atomic_ops_lock() would cause irqs to - * be disabled thus can't possibly be interrupted/preempted/clobbered by xchg() - * Other way around, xchg is one instruction anyways, so can't be interrupted - * as such + * EX instructions is baseline and present in !LLSC too. But in this + * regime it still needs use @atomic_ops_lock spinlock to allow interop + * with cmpxchg() which uses spinlock in !LLSC + * (llist.h use xchg and cmpxchg on sama data) */ -#if !defined(CONFIG_ARC_HAS_LLSC) && defined(CONFIG_SMP) - -#define arch_xchg(ptr, with) \ -({ \ - unsigned long flags; \ - typeof(*(ptr)) old_val; \ - \ - atomic_ops_lock(flags); \ - old_val = _xchg(ptr, with); \ - atomic_ops_unlock(flags); \ - old_val; \ +#define arch_xchg(ptr, val) \ +({ \ + __typeof__(ptr) _p_ = (ptr); \ + __typeof__(*(ptr)) _val_ = (val); \ + \ + unsigned long __flags; \ + \ + atomic_ops_lock(__flags); \ + \ + __asm__ __volatile__( \ + " ex %0, [%1] \n" \ + : "+r"(_val_) \ + : "r"(_p_) \ + : "memory"); \ + \ + atomic_ops_unlock(__flags); \ + _val_; \ }) -#else - -#define arch_xchg(ptr, with) _xchg(ptr, with) - #endif -/* - * "atomic" variant of xchg() - * REQ: It needs to follow the same serialization rules as other atomic_xxx() - * Since xchg() doesn't always do that, it would seem that following definition - * is incorrect. But here's the rationale: - * SMP : Even xchg() takes the atomic_ops_lock, so OK. - * LLSC: atomic_ops_lock are not relevant at all (even if SMP, since LLSC - * is natively "SMP safe", no serialization required). - * UP : other atomics disable IRQ, so no way a difft ctxt atomic_xchg() - * could clobber them. atomic_xchg() itself would be 1 insn, so it - * can't be clobbered by others. Thus no serialization required when - * atomic_xchg is involved. - */ -#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new)) - #endif diff --git a/arch/arc/include/asm/entry-compact.h b/arch/arc/include/asm/entry-compact.h index 6dbf5cecc8cc..5aab4f93ab8a 100644 --- a/arch/arc/include/asm/entry-compact.h +++ b/arch/arc/include/asm/entry-compact.h @@ -126,19 +126,11 @@ * to be saved again on kernel mode stack, as part of pt_regs. 
*-------------------------------------------------------------*/ .macro PROLOG_FREEUP_REG reg, mem -#ifndef ARC_USE_SCRATCH_REG - sr \reg, [ARC_REG_SCRATCH_DATA0] -#else st \reg, [\mem] -#endif .endm .macro PROLOG_RESTORE_REG reg, mem -#ifndef ARC_USE_SCRATCH_REG - lr \reg, [ARC_REG_SCRATCH_DATA0] -#else ld \reg, [\mem] -#endif .endm /*-------------------------------------------------------------- diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index 4eef17c5c1da..11b0ff26b97b 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -58,14 +58,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); -/* Generic variants assume pgtable_t is struct page *, hence need for these */ -#define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable); - -#define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); - #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/arch/arc/include/asm/mmu-arcv2.h b/arch/arc/include/asm/mmu-arcv2.h new file mode 100644 index 000000000000..ed9036d4ede3 --- /dev/null +++ b/arch/arc/include/asm/mmu-arcv2.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2004, 2007-2010, 2011-2012, 2019-20 Synopsys, Inc. (www.synopsys.com) + * + * MMUv3 (arc700) / MMUv4 (archs) are software page walked and software managed. + * This file contains the TLB access registers and commands + */ + +#ifndef _ASM_ARC_MMU_ARCV2_H +#define _ASM_ARC_MMU_ARCV2_H + +/* + * TLB Management regs + */ +#define ARC_REG_MMU_BCR 0x06f + +#ifdef CONFIG_ARC_MMU_V3 +#define ARC_REG_TLBPD0 0x405 +#define ARC_REG_TLBPD1 0x406 +#define ARC_REG_TLBPD1HI 0 /* Dummy: allows common code */ +#define ARC_REG_TLBINDEX 0x407 +#define ARC_REG_TLBCOMMAND 0x408 +#define ARC_REG_PID 0x409 +#define ARC_REG_SCRATCH_DATA0 0x418 +#else +#define ARC_REG_TLBPD0 0x460 +#define ARC_REG_TLBPD1 0x461 +#define ARC_REG_TLBPD1HI 0x463 +#define ARC_REG_TLBINDEX 0x464 +#define ARC_REG_TLBCOMMAND 0x465 +#define ARC_REG_PID 0x468 +#define ARC_REG_SCRATCH_DATA0 0x46c +#endif + +/* Bits in MMU PID reg */ +#define __TLB_ENABLE (1 << 31) +#define __PROG_ENABLE (1 << 30) +#define MMU_ENABLE (__TLB_ENABLE | __PROG_ENABLE) + +/* Bits in TLB Index reg */ +#define TLB_LKUP_ERR 0x80000000 + +#ifdef CONFIG_ARC_MMU_V3 +#define TLB_DUP_ERR (TLB_LKUP_ERR | 0x00000001) +#else +#define TLB_DUP_ERR (TLB_LKUP_ERR | 0x40000000) +#endif + +/* + * TLB Commands + */ +#define TLBWrite 0x1 +#define TLBRead 0x2 +#define TLBGetIndex 0x3 +#define TLBProbe 0x4 +#define TLBWriteNI 0x5 /* write JTLB without inv uTLBs */ +#define TLBIVUTLB 0x6 /* explicitly inv uTLBs */ + +#ifdef CONFIG_ARC_MMU_V4 +#define TLBInsertEntry 0x7 +#define TLBDeleteEntry 0x8 +#endif + +/* Masks for actual TLB "PD"s */ +#define PTE_BITS_IN_PD0 (_PAGE_GLOBAL | _PAGE_PRESENT | _PAGE_HW_SZ) +#define PTE_BITS_RWX (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ) + +#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK_PHYS | _PAGE_CACHEABLE) + +#ifndef __ASSEMBLY__ + +struct mm_struct; +extern int pae40_exist_but_not_enab(void); + +static inline int is_pae40_enabled(void) +{ + return IS_ENABLED(CONFIG_ARC_HAS_PAE40); +} + +static inline void mmu_setup_asid(struct mm_struct *mm, 
unsigned long asid) +{ + write_aux_reg(ARC_REG_PID, asid | MMU_ENABLE); +} + +static inline void mmu_setup_pgd(struct mm_struct *mm, void *pgd) +{ + /* PGD cached in MMU reg to avoid 3 mem lookups: task->mm->pgd */ +#ifdef CONFIG_ISA_ARCV2 + write_aux_reg(ARC_REG_SCRATCH_DATA0, (unsigned int)pgd); +#endif +} + +#else + +.macro ARC_MMU_REENABLE reg + lr \reg, [ARC_REG_PID] + or \reg, \reg, MMU_ENABLE + sr \reg, [ARC_REG_PID] +.endm + +#endif /* !__ASSEMBLY__ */ + +#endif diff --git a/arch/arc/include/asm/mmu.h b/arch/arc/include/asm/mmu.h index 26b731d32a2b..ca427c30f70e 100644 --- a/arch/arc/include/asm/mmu.h +++ b/arch/arc/include/asm/mmu.h @@ -7,98 +7,15 @@ #define _ASM_ARC_MMU_H #ifndef __ASSEMBLY__ -#include <linux/threads.h> /* NR_CPUS */ -#endif - -#if defined(CONFIG_ARC_MMU_V1) -#define CONFIG_ARC_MMU_VER 1 -#elif defined(CONFIG_ARC_MMU_V2) -#define CONFIG_ARC_MMU_VER 2 -#elif defined(CONFIG_ARC_MMU_V3) -#define CONFIG_ARC_MMU_VER 3 -#elif defined(CONFIG_ARC_MMU_V4) -#define CONFIG_ARC_MMU_VER 4 -#endif - -/* MMU Management regs */ -#define ARC_REG_MMU_BCR 0x06f -#if (CONFIG_ARC_MMU_VER < 4) -#define ARC_REG_TLBPD0 0x405 -#define ARC_REG_TLBPD1 0x406 -#define ARC_REG_TLBPD1HI 0 /* Dummy: allows code sharing with ARC700 */ -#define ARC_REG_TLBINDEX 0x407 -#define ARC_REG_TLBCOMMAND 0x408 -#define ARC_REG_PID 0x409 -#define ARC_REG_SCRATCH_DATA0 0x418 -#else -#define ARC_REG_TLBPD0 0x460 -#define ARC_REG_TLBPD1 0x461 -#define ARC_REG_TLBPD1HI 0x463 -#define ARC_REG_TLBINDEX 0x464 -#define ARC_REG_TLBCOMMAND 0x465 -#define ARC_REG_PID 0x468 -#define ARC_REG_SCRATCH_DATA0 0x46c -#endif - -#if defined(CONFIG_ISA_ARCV2) || !defined(CONFIG_SMP) -#define ARC_USE_SCRATCH_REG -#endif - -/* Bits in MMU PID register */ -#define __TLB_ENABLE (1 << 31) -#define __PROG_ENABLE (1 << 30) -#define MMU_ENABLE (__TLB_ENABLE | __PROG_ENABLE) - -/* Error code if probe fails */ -#define TLB_LKUP_ERR 0x80000000 - -#if (CONFIG_ARC_MMU_VER < 4) -#define TLB_DUP_ERR (TLB_LKUP_ERR | 0x00000001) -#else -#define TLB_DUP_ERR (TLB_LKUP_ERR | 0x40000000) -#endif - -/* TLB Commands */ -#define TLBWrite 0x1 -#define TLBRead 0x2 -#define TLBGetIndex 0x3 -#define TLBProbe 0x4 - -#if (CONFIG_ARC_MMU_VER >= 2) -#define TLBWriteNI 0x5 /* write JTLB without inv uTLBs */ -#define TLBIVUTLB 0x6 /* explicitly inv uTLBs */ -#else -#define TLBWriteNI TLBWrite /* Not present in hardware, fallback */ -#endif - -#if (CONFIG_ARC_MMU_VER >= 4) -#define TLBInsertEntry 0x7 -#define TLBDeleteEntry 0x8 -#endif -#ifndef __ASSEMBLY__ +#include <linux/threads.h> /* NR_CPUS */ typedef struct { unsigned long asid[NR_CPUS]; /* 8 bit MMU PID + Generation cycle */ } mm_context_t; -#ifdef CONFIG_ARC_DBG_TLB_PARANOIA -void tlb_paranoid_check(unsigned int mm_asid, unsigned long address); -#else -#define tlb_paranoid_check(a, b) #endif -void arc_mmu_init(void); -extern char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len); -void read_decode_mmu_bcr(void); - -static inline int is_pae40_enabled(void) -{ - return IS_ENABLED(CONFIG_ARC_HAS_PAE40); -} - -extern int pae40_exist_but_not_enab(void); - -#endif /* !__ASSEMBLY__ */ +#include <asm/mmu-arcv2.h> #endif diff --git a/arch/arc/include/asm/mmu_context.h b/arch/arc/include/asm/mmu_context.h index df164066e172..dda471f5f05b 100644 --- a/arch/arc/include/asm/mmu_context.h +++ b/arch/arc/include/asm/mmu_context.h @@ -15,22 +15,23 @@ #ifndef _ASM_ARC_MMU_CONTEXT_H #define _ASM_ARC_MMU_CONTEXT_H -#include <asm/arcregs.h> -#include <asm/tlb.h> #include <linux/sched/mm.h> +#include <asm/tlb.h> #include 
<asm-generic/mm_hooks.h> -/* ARC700 ASID Management +/* ARC ASID Management + * + * MMU tags TLBs with an 8-bit ASID, avoiding need to flush the TLB on + * context-switch. * - * ARC MMU provides 8-bit ASID (0..255) to TAG TLB entries, allowing entries - * with same vaddr (different tasks) to co-exit. This provides for - * "Fast Context Switch" i.e. no TLB flush on ctxt-switch + * ASID is managed per cpu, so task threads across CPUs can have different + * ASID. Global ASID management is needed if hardware supports TLB shootdown + * and/or shared TLB across cores, which ARC doesn't. * - * Linux assigns each task a unique ASID. A simple round-robin allocation - * of H/w ASID is done using software tracker @asid_cpu. - * When it reaches max 255, the allocation cycle starts afresh by flushing - * the entire TLB and wrapping ASID back to zero. + * Each task is assigned unique ASID, with a simple round-robin allocator + * tracked in @asid_cpu. When 8-bit value rolls over,a new cycle is started + * over from 0, and TLB is flushed * * A new allocation cycle, post rollover, could potentially reassign an ASID * to a different task. Thus the rule is to refresh the ASID in a new cycle. @@ -93,7 +94,7 @@ static inline void get_new_mmu_context(struct mm_struct *mm) asid_mm(mm, cpu) = asid_cpu(cpu); set_hw: - write_aux_reg(ARC_REG_PID, hw_pid(mm, cpu) | MMU_ENABLE); + mmu_setup_asid(mm, hw_pid(mm, cpu)); local_irq_restore(flags); } @@ -146,10 +147,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ cpumask_set_cpu(cpu, mm_cpumask(next)); -#ifdef ARC_USE_SCRATCH_REG - /* PGD cached in MMU reg to avoid 3 mem lookups: task->mm->pgd */ - write_aux_reg(ARC_REG_SCRATCH_DATA0, next->pgd); -#endif + mmu_setup_pgd(next, next->pgd); get_new_mmu_context(next); } diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 4a9d33372fe2..9a62e1d87967 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -34,57 +34,55 @@ void copy_user_highpage(struct page *to, struct page *from, unsigned long u_vaddr, struct vm_area_struct *vma); void clear_user_page(void *to, unsigned long u_vaddr, struct page *page); -#undef STRICT_MM_TYPECHECKS - -#ifdef STRICT_MM_TYPECHECKS -/* - * These are used to make use of C type-checking.. 
- */ -typedef struct { -#ifdef CONFIG_ARC_HAS_PAE40 - unsigned long long pte; -#else - unsigned long pte; -#endif -} pte_t; typedef struct { unsigned long pgd; } pgd_t; + +#define pgd_val(x) ((x).pgd) +#define __pgd(x) ((pgd_t) { (x) }) + +#if CONFIG_PGTABLE_LEVELS > 3 + typedef struct { - unsigned long pgprot; -} pgprot_t; + unsigned long pud; +} pud_t; -#define pte_val(x) ((x).pte) -#define pgd_val(x) ((x).pgd) -#define pgprot_val(x) ((x).pgprot) +#define pud_val(x) ((x).pud) +#define __pud(x) ((pud_t) { (x) }) -#define __pte(x) ((pte_t) { (x) }) -#define __pgd(x) ((pgd_t) { (x) }) -#define __pgprot(x) ((pgprot_t) { (x) }) +#endif + +#if CONFIG_PGTABLE_LEVELS > 2 -#define pte_pgprot(x) __pgprot(pte_val(x)) +typedef struct { + unsigned long pmd; +} pmd_t; -#else /* !STRICT_MM_TYPECHECKS */ +#define pmd_val(x) ((x).pmd) +#define __pmd(x) ((pmd_t) { (x) }) +#endif + +typedef struct { #ifdef CONFIG_ARC_HAS_PAE40 -typedef unsigned long long pte_t; + unsigned long long pte; #else -typedef unsigned long pte_t; + unsigned long pte; #endif -typedef unsigned long pgd_t; -typedef unsigned long pgprot_t; +} pte_t; -#define pte_val(x) (x) -#define pgd_val(x) (x) -#define pgprot_val(x) (x) -#define __pte(x) (x) -#define __pgd(x) (x) -#define __pgprot(x) (x) -#define pte_pgprot(x) (x) +#define pte_val(x) ((x).pte) +#define __pte(x) ((pte_t) { (x) }) -#endif +typedef struct { + unsigned long pgprot; +} pgprot_t; + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) }) +#define pte_pgprot(x) __pgprot(pte_val(x)) -typedef pte_t * pgtable_t; +typedef struct page *pgtable_t; /* * Use virt_to_pfn with caution: @@ -122,8 +120,8 @@ extern int pfn_valid(unsigned long pfn); * virt here means link-address/program-address as embedded in object code. * And for ARC, link-addr = physical address */ -#define __pa(vaddr) ((unsigned long)(vaddr)) -#define __va(paddr) ((void *)((unsigned long)(paddr))) +#define __pa(vaddr) ((unsigned long)(vaddr)) +#define __va(paddr) ((void *)((unsigned long)(paddr))) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index a32ca3104ced..096b8ef58edb 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -31,30 +31,32 @@ #include <linux/mm.h> #include <linux/log2.h> +#include <asm-generic/pgalloc.h> static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, pte); + /* + * The cast to long below is OK in 32-bit PAE40 regime with long long pte + * Despite "wider" pte, the pte table needs to be in non-PAE low memory + * as all higher levels can only hold long pointers. 
+ * + * The cast itself is needed given simplistic definition of set_pmd() + */ + set_pmd(pmd, __pmd((unsigned long)pte)); } -static inline void -pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t ptep) +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, (pte_t *) ptep); -} - -static inline int __get_order_pgd(void) -{ - return get_order(PTRS_PER_PGD * sizeof(pgd_t)); + set_pmd(pmd, __pmd((unsigned long)page_address(pte_page))); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - int num, num2; - pgd_t *ret = (pgd_t *) __get_free_pages(GFP_KERNEL, __get_order_pgd()); + pgd_t *ret = (pgd_t *) __get_free_page(GFP_KERNEL); if (ret) { + int num, num2; num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE; memzero(ret, num * sizeof(pgd_t)); @@ -68,64 +70,27 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return ret; } -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - free_pages((unsigned long)pgd, __get_order_pgd()); -} - - -/* - * With software-only page-tables, addr-split for traversal is tweakable and - * that directly governs how big tables would be at each level. - * Further, the MMU page size is configurable. - * Thus we need to programatically assert the size constraint - * All of this is const math, allowing gcc to do constant folding/propagation. - */ +#if CONFIG_PGTABLE_LEVELS > 3 -static inline int __get_order_pte(void) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) { - return get_order(PTRS_PER_PTE * sizeof(pte_t)); + set_p4d(p4dp, __p4d((unsigned long)pudp)); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte; +#define __pud_free_tlb(tlb, pmd, addr) pud_free((tlb)->mm, pmd) - pte = (pte_t *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, - __get_order_pte()); +#endif - return pte; -} +#if CONFIG_PGTABLE_LEVELS > 2 -static inline pgtable_t -pte_alloc_one(struct mm_struct *mm) +static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) { - pgtable_t pte_pg; - struct page *page; - - pte_pg = (pgtable_t)__get_free_pages(GFP_KERNEL, __get_order_pte()); - if (!pte_pg) - return 0; - memzero((void *)pte_pg, PTRS_PER_PTE * sizeof(pte_t)); - page = virt_to_page(pte_pg); - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); - return 0; - } - - return pte_pg; + set_pud(pudp, __pud((unsigned long)pmdp)); } -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_pages((unsigned long)pte, __get_order_pte()); /* takes phy addr */ -} +#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) -static inline void pte_free(struct mm_struct *mm, pgtable_t ptep) -{ - pgtable_pte_page_dtor(virt_to_page(ptep)); - free_pages((unsigned long)ptep, __get_order_pte()); -} +#endif #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h new file mode 100644 index 000000000000..183d23bc1e00 --- /dev/null +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. 
(www.synopsys.com) + */ + +/* + * page table flags for software walked/managed MMUv3 (ARC700) and MMUv4 (HS) + * There correspond to the corresponding bits in the TLB + */ + +#ifndef _ASM_ARC_PGTABLE_BITS_ARCV2_H +#define _ASM_ARC_PGTABLE_BITS_ARCV2_H + +#ifdef CONFIG_ARC_CACHE_PAGES +#define _PAGE_CACHEABLE (1 << 0) /* Cached (H) */ +#else +#define _PAGE_CACHEABLE 0 +#endif + +#define _PAGE_EXECUTE (1 << 1) /* User Execute (H) */ +#define _PAGE_WRITE (1 << 2) /* User Write (H) */ +#define _PAGE_READ (1 << 3) /* User Read (H) */ +#define _PAGE_ACCESSED (1 << 4) /* Accessed (s) */ +#define _PAGE_DIRTY (1 << 5) /* Modified (s) */ +#define _PAGE_SPECIAL (1 << 6) +#define _PAGE_GLOBAL (1 << 8) /* ASID agnostic (H) */ +#define _PAGE_PRESENT (1 << 9) /* PTE/TLB Valid (H) */ + +#ifdef CONFIG_ARC_MMU_V4 +#define _PAGE_HW_SZ (1 << 10) /* Normal/super (H) */ +#else +#define _PAGE_HW_SZ 0 +#endif + +/* Defaults for every user page */ +#define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE) + +/* Set of bits not changed in pte_modify */ +#define _PAGE_CHG_MASK (PAGE_MASK_PHYS | _PAGE_ACCESSED | _PAGE_DIRTY | \ + _PAGE_SPECIAL) + +/* More Abbrevaited helpers */ +#define PAGE_U_NONE __pgprot(___DEF) +#define PAGE_U_R __pgprot(___DEF | _PAGE_READ) +#define PAGE_U_W_R __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE) +#define PAGE_U_X_R __pgprot(___DEF | _PAGE_READ | _PAGE_EXECUTE) +#define PAGE_U_X_W_R __pgprot(___DEF \ + | _PAGE_READ | _PAGE_WRITE | _PAGE_EXECUTE) +#define PAGE_KERNEL __pgprot(___DEF | _PAGE_GLOBAL \ + | _PAGE_READ | _PAGE_WRITE | _PAGE_EXECUTE) + +#define PAGE_SHARED PAGE_U_W_R + +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) & ~_PAGE_CACHEABLE)) + +/* + * Mapping of vm_flags (Generic VM) to PTE flags (arch specific) + * + * Certain cases have 1:1 mapping + * e.g. __P101 means VM_READ, VM_EXEC and !VM_SHARED + * which directly corresponds to PAGE_U_X_R + * + * Other rules which cause the divergence from 1:1 mapping + * + * 1. Although ARC700 can do exclusive execute/write protection (meaning R + * can be tracked independet of X/W unlike some other CPUs), still to + * keep things consistent with other archs: + * -Write implies Read: W => R + * -Execute implies Read: X => R + * + * 2. 
Pvt Writable doesn't have Write Enabled initially: Pvt-W => !W + * This is to enable COW mechanism + */ + /* xwr */ +#define __P000 PAGE_U_NONE +#define __P001 PAGE_U_R +#define __P010 PAGE_U_R /* Pvt-W => !W */ +#define __P011 PAGE_U_R /* Pvt-W => !W */ +#define __P100 PAGE_U_X_R /* X => R */ +#define __P101 PAGE_U_X_R +#define __P110 PAGE_U_X_R /* Pvt-W => !W and X => R */ +#define __P111 PAGE_U_X_R /* Pvt-W => !W */ + +#define __S000 PAGE_U_NONE +#define __S001 PAGE_U_R +#define __S010 PAGE_U_W_R /* W => R */ +#define __S011 PAGE_U_W_R +#define __S100 PAGE_U_X_R /* X => R */ +#define __S101 PAGE_U_X_R +#define __S110 PAGE_U_X_W_R /* X => R */ +#define __S111 PAGE_U_X_W_R + +#ifndef __ASSEMBLY__ + +#define pte_write(pte) (pte_val(pte) & _PAGE_WRITE) +#define pte_dirty(pte) (pte_val(pte) & _PAGE_DIRTY) +#define pte_young(pte) (pte_val(pte) & _PAGE_ACCESSED) +#define pte_special(pte) (pte_val(pte) & _PAGE_SPECIAL) + +#define PTE_BIT_FUNC(fn, op) \ + static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; } + +PTE_BIT_FUNC(mknotpresent, &= ~(_PAGE_PRESENT)); +PTE_BIT_FUNC(wrprotect, &= ~(_PAGE_WRITE)); +PTE_BIT_FUNC(mkwrite, |= (_PAGE_WRITE)); +PTE_BIT_FUNC(mkclean, &= ~(_PAGE_DIRTY)); +PTE_BIT_FUNC(mkdirty, |= (_PAGE_DIRTY)); +PTE_BIT_FUNC(mkold, &= ~(_PAGE_ACCESSED)); +PTE_BIT_FUNC(mkyoung, |= (_PAGE_ACCESSED)); +PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL)); +PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ)); + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); +} + +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + set_pte(ptep, pteval); +} + +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep); + +/* Encode swap {type,off} tuple into PTE + * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that + * PAGE_PRESENT is zero in a PTE holding swap "identifier" + */ +#define __swp_entry(type, off) ((swp_entry_t) \ + { ((type) & 0x1f) | ((off) << 13) }) + +/* Decode a PTE containing swap "identifier "into constituents */ +#define __swp_type(pte_lookalike) (((pte_lookalike).val) & 0x1f) +#define __swp_offset(pte_lookalike) ((pte_lookalike).val >> 13) + +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) +#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) + +#define kern_addr_valid(addr) (1) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#include <asm/hugepage.h> +#endif + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h new file mode 100644 index 000000000000..8084ef2f6491 --- /dev/null +++ b/arch/arc/include/asm/pgtable-levels.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Synopsys, Inc. 
(www.synopsys.com) + */ + +/* + * Helpers for implemenintg paging levels + */ + +#ifndef _ASM_ARC_PGTABLE_LEVELS_H +#define _ASM_ARC_PGTABLE_LEVELS_H + +#if CONFIG_PGTABLE_LEVELS == 2 + +/* + * 2 level paging setup for software walked MMUv3 (ARC700) and MMUv4 (HS) + * + * [31] 32 bit virtual address [0] + * ------------------------------------------------------- + * | | <---------- PGDIR_SHIFT ----------> | + * | | | <-- PAGE_SHIFT --> | + * ------------------------------------------------------- + * | | | + * | | --> off in page frame + * | ---> index into Page Table + * ----> index into Page Directory + * + * Given software walk, the vaddr split is arbitrary set to 11:8:13 + * However enabling of super page in a 2 level regime pegs PGDIR_SHIFT to + * super page size. + */ + +#if defined(CONFIG_ARC_HUGEPAGE_16M) +#define PGDIR_SHIFT 24 +#elif defined(CONFIG_ARC_HUGEPAGE_2M) +#define PGDIR_SHIFT 21 +#else +/* + * No Super page case + * Default value provides 11:8:13 (8K), 10:10:12 (4K) + * Limits imposed by pgtable_t only PAGE_SIZE long + * (so 4K page can only have 1K entries: or 10 bits) + */ +#ifdef CONFIG_ARC_PAGE_SIZE_4K +#define PGDIR_SHIFT 22 +#else +#define PGDIR_SHIFT 21 +#endif + +#endif + +#else /* CONFIG_PGTABLE_LEVELS != 2 */ + +/* + * A default 3 level paging testing setup in software walked MMU + * MMUv4 (8K page): <4> : <7> : <8> : <13> + * A default 4 level paging testing setup in software walked MMU + * MMUv4 (8K page): <4> : <3> : <4> : <8> : <13> + */ +#define PGDIR_SHIFT 28 +#if CONFIG_PGTABLE_LEVELS > 3 +#define PUD_SHIFT 25 +#endif +#if CONFIG_PGTABLE_LEVELS > 2 +#define PMD_SHIFT 21 +#endif + +#endif /* CONFIG_PGTABLE_LEVELS */ + +#define PGDIR_SIZE BIT(PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) +#define PTRS_PER_PGD BIT(32 - PGDIR_SHIFT) + +#if CONFIG_PGTABLE_LEVELS > 3 +#define PUD_SIZE BIT(PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) +#define PTRS_PER_PUD BIT(PGDIR_SHIFT - PUD_SHIFT) +#endif + +#if CONFIG_PGTABLE_LEVELS > 2 +#define PMD_SIZE BIT(PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE - 1)) +#define PTRS_PER_PMD BIT(PUD_SHIFT - PMD_SHIFT) +#endif + +#define PTRS_PER_PTE BIT(PMD_SHIFT - PAGE_SHIFT) + +#ifndef __ASSEMBLY__ + +#if CONFIG_PGTABLE_LEVELS > 3 +#include <asm-generic/pgtable-nop4d.h> +#elif CONFIG_PGTABLE_LEVELS > 2 +#include <asm-generic/pgtable-nopud.h> +#else +#include <asm-generic/pgtable-nopmd.h> +#endif + +/* + * 1st level paging: pgd + */ +#define pgd_index(addr) ((addr) >> PGDIR_SHIFT) +#define pgd_offset(mm, addr) (((mm)->pgd) + pgd_index(addr)) +#define pgd_offset_k(addr) pgd_offset(&init_mm, addr) +#define pgd_ERROR(e) \ + pr_crit("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) + +#if CONFIG_PGTABLE_LEVELS > 3 + +/* In 4 level paging, p4d_* macros work on pgd */ +#define p4d_none(x) (!p4d_val(x)) +#define p4d_bad(x) ((p4d_val(x) & ~PAGE_MASK)) +#define p4d_present(x) (p4d_val(x)) +#define p4d_clear(xp) do { p4d_val(*(xp)) = 0; } while (0) +#define p4d_pgtable(p4d) ((pud_t *)(p4d_val(p4d) & PAGE_MASK)) +#define p4d_page(p4d) virt_to_page(p4d_pgtable(p4d)) +#define set_p4d(p4dp, p4d) (*(p4dp) = p4d) + +/* + * 2nd level paging: pud + */ +#define pud_ERROR(e) \ + pr_crit("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e)) + +#endif + +#if CONFIG_PGTABLE_LEVELS > 2 + +/* + * In 3 level paging, pud_* macros work on pgd + * In 4 level paging, pud_* macros work on pud + */ +#define pud_none(x) (!pud_val(x)) +#define pud_bad(x) ((pud_val(x) & ~PAGE_MASK)) +#define pud_present(x) (pud_val(x)) +#define pud_clear(xp) do { 
pud_val(*(xp)) = 0; } while (0) +#define pud_pgtable(pud) ((pmd_t *)(pud_val(pud) & PAGE_MASK)) +#define pud_page(pud) virt_to_page(pud_pgtable(pud)) +#define set_pud(pudp, pud) (*(pudp) = pud) + +/* + * 3rd level paging: pmd + */ +#define pmd_ERROR(e) \ + pr_crit("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) + +#define pmd_pfn(pmd) ((pmd_val(pmd) & PMD_MASK) >> PAGE_SHIFT) +#define pfn_pmd(pfn,prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) + +#endif + +/* + * Due to the strange way generic pgtable level folding works, the pmd_* macros + * - are valid even for 2 levels (which supposedly only has pgd - pte) + * - behave differently for 2 vs. 3 + * In 2 level paging (pgd -> pte), pmd_* macros work on pgd + * In 3+ level paging (pgd -> pmd -> pte), pmd_* macros work on pmd + */ +#define pmd_none(x) (!pmd_val(x)) +#define pmd_bad(x) ((pmd_val(x) & ~PAGE_MASK)) +#define pmd_present(x) (pmd_val(x)) +#define pmd_clear(xp) do { pmd_val(*(xp)) = 0; } while (0) +#define pmd_page_vaddr(pmd) (pmd_val(pmd) & PAGE_MASK) +#define pmd_page(pmd) virt_to_page(pmd_page_vaddr(pmd)) +#define set_pmd(pmdp, pmd) (*(pmdp) = pmd) +#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd)) + +/* + * 4th level paging: pte + */ +#define pte_ERROR(e) \ + pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) + +#define pte_none(x) (!pte_val(x)) +#define pte_present(x) (pte_val(x) & _PAGE_PRESENT) +#define pte_clear(mm,addr,ptep) set_pte_at(mm, addr, ptep, __pte(0)) +#define pte_page(pte) pfn_to_page(pte_pfn(pte)) +#define set_pte(ptep, pte) ((*(ptep)) = (pte)) +#define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) +#define pfn_pte(pfn, prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot)) +#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) + +#ifdef CONFIG_ISA_ARCV2 +#define pmd_leaf(x) (pmd_val(x) & _PAGE_HW_SZ) +#endif + +#endif /* !__ASSEMBLY__ */ + +#endif diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 320cc0ae8a08..9320b04c04bf 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -1,220 +1,17 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) - * - * vineetg: May 2011 - * -Folded PAGE_PRESENT (used by VM) and PAGE_VALID (used by MMU) into 1. - * They are semantically the same although in different contexts - * VALID marks a TLB entry exists and it will only happen if PRESENT - * - Utilise some unused free bits to confine PTE flags to 12 bits - * This is a must for 4k pg-sz - * - * vineetg: Mar 2011 - changes to accommodate MMU TLB Page Descriptor mods - * -TLB Locking never really existed, except for initial specs - * -SILENT_xxx not needed for our port - * -Per my request, MMU V3 changes the layout of some of the bits - * to avoid a few shifts in TLB Miss handlers. - * - * vineetg: April 2010 - * -PGD entry no longer contains any flags. If empty it is 0, otherwise has - * Pg-Tbl ptr. Thus pmd_present(), pmd_valid(), pmd_set( ) become simpler - * - * vineetg: April 2010 - * -Switched form 8:11:13 split for page table lookup to 11:8:13 - * -this speeds up page table allocation itself as we now have to memset 1K - * instead of 8k per page table. 
- * -TODO: Right now page table alloc is 8K and rest 7K is unused - * need to optimise it - * - * Amit Bhor, Sameer Dhavale: Codito Technologies 2004 */ #ifndef _ASM_ARC_PGTABLE_H #define _ASM_ARC_PGTABLE_H #include <linux/bits.h> -#include <asm-generic/pgtable-nopmd.h> -#include <asm/page.h> -#include <asm/mmu.h> /* to propagate CONFIG_ARC_MMU_VER <n> */ - -/************************************************************************** - * Page Table Flags - * - * ARC700 MMU only deals with softare managed TLB entries. - * Page Tables are purely for Linux VM's consumption and the bits below are - * suited to that (uniqueness). Hence some are not implemented in the TLB and - * some have different value in TLB. - * e.g. MMU v2: K_READ bit is 8 and so is GLOBAL (possible because they live in - * seperate PD0 and PD1, which combined forms a translation entry) - * while for PTE perspective, they are 8 and 9 respectively - * with MMU v3: Most bits (except SHARED) represent the exact hardware pos - * (saves some bit shift ops in TLB Miss hdlrs) - */ - -#if (CONFIG_ARC_MMU_VER <= 2) - -#define _PAGE_ACCESSED (1<<1) /* Page is accessed (S) */ -#define _PAGE_CACHEABLE (1<<2) /* Page is cached (H) */ -#define _PAGE_EXECUTE (1<<3) /* Page has user execute perm (H) */ -#define _PAGE_WRITE (1<<4) /* Page has user write perm (H) */ -#define _PAGE_READ (1<<5) /* Page has user read perm (H) */ -#define _PAGE_DIRTY (1<<6) /* Page modified (dirty) (S) */ -#define _PAGE_SPECIAL (1<<7) -#define _PAGE_GLOBAL (1<<8) /* Page is global (H) */ -#define _PAGE_PRESENT (1<<10) /* TLB entry is valid (H) */ - -#else /* MMU v3 onwards */ - -#define _PAGE_CACHEABLE (1<<0) /* Page is cached (H) */ -#define _PAGE_EXECUTE (1<<1) /* Page has user execute perm (H) */ -#define _PAGE_WRITE (1<<2) /* Page has user write perm (H) */ -#define _PAGE_READ (1<<3) /* Page has user read perm (H) */ -#define _PAGE_ACCESSED (1<<4) /* Page is accessed (S) */ -#define _PAGE_DIRTY (1<<5) /* Page modified (dirty) (S) */ -#define _PAGE_SPECIAL (1<<6) - -#if (CONFIG_ARC_MMU_VER >= 4) -#define _PAGE_WTHRU (1<<7) /* Page cache mode write-thru (H) */ -#endif - -#define _PAGE_GLOBAL (1<<8) /* Page is global (H) */ -#define _PAGE_PRESENT (1<<9) /* TLB entry is valid (H) */ - -#if (CONFIG_ARC_MMU_VER >= 4) -#define _PAGE_HW_SZ (1<<10) /* Page Size indicator (H): 0 normal, 1 super */ -#endif - -#define _PAGE_SHARED_CODE (1<<11) /* Shared Code page with cmn vaddr - usable for shared TLB entries (H) */ - -#define _PAGE_UNUSED_BIT (1<<12) -#endif - -/* vmalloc permissions */ -#define _K_PAGE_PERMS (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ | \ - _PAGE_GLOBAL | _PAGE_PRESENT) - -#ifndef CONFIG_ARC_CACHE_PAGES -#undef _PAGE_CACHEABLE -#define _PAGE_CACHEABLE 0 -#endif -#ifndef _PAGE_HW_SZ -#define _PAGE_HW_SZ 0 -#endif - -/* Defaults for every user page */ -#define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE) - -/* Set of bits not changed in pte_modify */ -#define _PAGE_CHG_MASK (PAGE_MASK_PHYS | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SPECIAL) -/* More Abbrevaited helpers */ -#define PAGE_U_NONE __pgprot(___DEF) -#define PAGE_U_R __pgprot(___DEF | _PAGE_READ) -#define PAGE_U_W_R __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE) -#define PAGE_U_X_R __pgprot(___DEF | _PAGE_READ | _PAGE_EXECUTE) -#define PAGE_U_X_W_R __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE | \ - _PAGE_EXECUTE) - -#define PAGE_SHARED PAGE_U_W_R - -/* While kernel runs out of unstranslated space, vmalloc/modules use a chunk of - * user vaddr space - visible in all addr spaces, but kernel mode only - * 
Thus Global, all-kernel-access, no-user-access, cached - */ -#define PAGE_KERNEL __pgprot(_K_PAGE_PERMS | _PAGE_CACHEABLE) - -/* ioremap */ -#define PAGE_KERNEL_NO_CACHE __pgprot(_K_PAGE_PERMS) - -/* Masks for actual TLB "PD"s */ -#define PTE_BITS_IN_PD0 (_PAGE_GLOBAL | _PAGE_PRESENT | _PAGE_HW_SZ) -#define PTE_BITS_RWX (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ) - -#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK_PHYS | _PAGE_CACHEABLE) - -/************************************************************************** - * Mapping of vm_flags (Generic VM) to PTE flags (arch specific) - * - * Certain cases have 1:1 mapping - * e.g. __P101 means VM_READ, VM_EXEC and !VM_SHARED - * which directly corresponds to PAGE_U_X_R - * - * Other rules which cause the divergence from 1:1 mapping - * - * 1. Although ARC700 can do exclusive execute/write protection (meaning R - * can be tracked independet of X/W unlike some other CPUs), still to - * keep things consistent with other archs: - * -Write implies Read: W => R - * -Execute implies Read: X => R - * - * 2. Pvt Writable doesn't have Write Enabled initially: Pvt-W => !W - * This is to enable COW mechanism - */ - /* xwr */ -#define __P000 PAGE_U_NONE -#define __P001 PAGE_U_R -#define __P010 PAGE_U_R /* Pvt-W => !W */ -#define __P011 PAGE_U_R /* Pvt-W => !W */ -#define __P100 PAGE_U_X_R /* X => R */ -#define __P101 PAGE_U_X_R -#define __P110 PAGE_U_X_R /* Pvt-W => !W and X => R */ -#define __P111 PAGE_U_X_R /* Pvt-W => !W */ - -#define __S000 PAGE_U_NONE -#define __S001 PAGE_U_R -#define __S010 PAGE_U_W_R /* W => R */ -#define __S011 PAGE_U_W_R -#define __S100 PAGE_U_X_R /* X => R */ -#define __S101 PAGE_U_X_R -#define __S110 PAGE_U_X_W_R /* X => R */ -#define __S111 PAGE_U_X_W_R - -/**************************************************************** - * 2 tier (PGD:PTE) software page walker - * - * [31] 32 bit virtual address [0] - * ------------------------------------------------------- - * | | <------------ PGDIR_SHIFT ----------> | - * | | | - * | BITS_FOR_PGD | BITS_FOR_PTE | <-- PAGE_SHIFT --> | - * ------------------------------------------------------- - * | | | - * | | --> off in page frame - * | ---> index into Page Table - * ----> index into Page Directory - * - * In a single page size configuration, only PAGE_SHIFT is fixed - * So both PGD and PTE sizing can be tweaked - * e.g. 8K page (PAGE_SHIFT 13) can have - * - PGDIR_SHIFT 21 -> 11:8:13 address split - * - PGDIR_SHIFT 24 -> 8:11:13 address split - * - * If Super Page is configured, PGDIR_SHIFT becomes fixed too, - * so the sizing flexibility is gone. - */ - -#if defined(CONFIG_ARC_HUGEPAGE_16M) -#define PGDIR_SHIFT 24 -#elif defined(CONFIG_ARC_HUGEPAGE_2M) -#define PGDIR_SHIFT 21 -#else -/* - * Only Normal page support so "hackable" (see comment above) - * Default value provides 11:8:13 (8K), 11:9:12 (4K) - */ -#define PGDIR_SHIFT 21 -#endif - -#define BITS_FOR_PTE (PGDIR_SHIFT - PAGE_SHIFT) -#define BITS_FOR_PGD (32 - PGDIR_SHIFT) - -#define PGDIR_SIZE BIT(PGDIR_SHIFT) /* vaddr span, not PDG sz */ -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -#define PTRS_PER_PTE BIT(BITS_FOR_PTE) -#define PTRS_PER_PGD BIT(BITS_FOR_PGD) +#include <asm/pgtable-levels.h> +#include <asm/pgtable-bits-arcv2.h> +#include <asm/page.h> +#include <asm/mmu.h> /* * Number of entries a user land program use. 
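(Editorial aside, not part of the patch.) The 11:8:13 address split shown in the page-walker diagrams above is plain shift-and-mask arithmetic; a minimal standalone sketch, assuming the default 8K-page configuration (PAGE_SHIFT 13, PGDIR_SHIFT 21):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT   13   /* 8K pages (the default) */
#define PGDIR_SHIFT  21   /* gives the 11:8:13 split described above */
#define PTRS_PER_PTE (1u << (PGDIR_SHIFT - PAGE_SHIFT))
#define PTRS_PER_PGD (1u << (32 - PGDIR_SHIFT))

int main(void)
{
	uint32_t vaddr = 0x7f20a123;	/* arbitrary example address */

	unsigned pgd_idx = vaddr >> PGDIR_SHIFT;                        /* index into page directory */
	unsigned pte_idx = (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);  /* index into page table */
	unsigned off     = vaddr & ((1u << PAGE_SHIFT) - 1);            /* offset inside the page */

	printf("pgd %u/%u, pte %u/%u, off 0x%x\n",
	       pgd_idx, PTRS_PER_PGD, pte_idx, PTRS_PER_PTE, off);
	return 0;
}

The same arithmetic applies when super pages peg PGDIR_SHIFT to 24 or 21; only the macro values change.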
@@ -222,143 +19,17 @@ */ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) - -/**************************************************************** - * Bucket load of VM Helpers - */ - #ifndef __ASSEMBLY__ -#define pte_ERROR(e) \ - pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) -#define pgd_ERROR(e) \ - pr_crit("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) - -/* the zero page used for uninitialized and anonymous pages */ extern char empty_zero_page[PAGE_SIZE]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval)) -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) - -/* find the page descriptor of the Page Tbl ref by PMD entry */ -#define pmd_page(pmd) virt_to_page(pmd_val(pmd) & PAGE_MASK) - -/* find the logical addr (phy for ARC) of the Page Tbl ref by PMD entry */ -#define pmd_page_vaddr(pmd) (pmd_val(pmd) & PAGE_MASK) - -/* In a 2 level sys, setup the PGD entry with PTE value */ -static inline void pmd_set(pmd_t *pmdp, pte_t *ptep) -{ - pmd_val(*pmdp) = (unsigned long)ptep; -} - -#define pte_none(x) (!pte_val(x)) -#define pte_present(x) (pte_val(x) & _PAGE_PRESENT) -#define pte_clear(mm, addr, ptep) set_pte_at(mm, addr, ptep, __pte(0)) - -#define pmd_none(x) (!pmd_val(x)) -#define pmd_bad(x) ((pmd_val(x) & ~PAGE_MASK)) -#define pmd_present(x) (pmd_val(x)) -#define pmd_leaf(x) (pmd_val(x) & _PAGE_HW_SZ) -#define pmd_clear(xp) do { pmd_val(*(xp)) = 0; } while (0) - -#define pte_page(pte) pfn_to_page(pte_pfn(pte)) -#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) -#define pfn_pte(pfn, prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot)) - -/* Don't use virt_to_pfn for macros below: could cause truncations for PAE40*/ -#define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) - -/* Zoo of pte_xxx function */ -#define pte_read(pte) (pte_val(pte) & _PAGE_READ) -#define pte_write(pte) (pte_val(pte) & _PAGE_WRITE) -#define pte_dirty(pte) (pte_val(pte) & _PAGE_DIRTY) -#define pte_young(pte) (pte_val(pte) & _PAGE_ACCESSED) -#define pte_special(pte) (pte_val(pte) & _PAGE_SPECIAL) - -#define PTE_BIT_FUNC(fn, op) \ - static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; } - -PTE_BIT_FUNC(mknotpresent, &= ~(_PAGE_PRESENT)); -PTE_BIT_FUNC(wrprotect, &= ~(_PAGE_WRITE)); -PTE_BIT_FUNC(mkwrite, |= (_PAGE_WRITE)); -PTE_BIT_FUNC(mkclean, &= ~(_PAGE_DIRTY)); -PTE_BIT_FUNC(mkdirty, |= (_PAGE_DIRTY)); -PTE_BIT_FUNC(mkold, &= ~(_PAGE_ACCESSED)); -PTE_BIT_FUNC(mkyoung, |= (_PAGE_ACCESSED)); -PTE_BIT_FUNC(exprotect, &= ~(_PAGE_EXECUTE)); -PTE_BIT_FUNC(mkexec, |= (_PAGE_EXECUTE)); -PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL)); -PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ)); - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); -} +extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE); /* Macro to mark a page protection as uncacheable */ #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) & ~_PAGE_CACHEABLE)) -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - set_pte(ptep, pteval); -} - -/* - * Macro to quickly access the PGD entry, utlising the fact that some - * arch may cache the pointer to Page Directory of "current" task - * in a MMU register - * - * Thus task->mm->pgd (3 pointer dereferences, cache misses etc simply - * becomes read a register - * - * ********CAUTION*******: - * Kernel code might be dealing with some mm_struct of NON "current" - * Thus use this macro 
only when you are certain that "current" is current - * e.g. when dealing with signal frame setup code etc - */ -#ifdef ARC_USE_SCRATCH_REG -#define pgd_offset_fast(mm, addr) \ -({ \ - pgd_t *pgd_base = (pgd_t *) read_aux_reg(ARC_REG_SCRATCH_DATA0); \ - pgd_base + pgd_index(addr); \ -}) -#else -#define pgd_offset_fast(mm, addr) pgd_offset(mm, addr) -#endif - extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep); - -/* Encode swap {type,off} tuple into PTE - * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that - * PAGE_PRESENT is zero in a PTE holding swap "identifier" - */ -#define __swp_entry(type, off) ((swp_entry_t) { \ - ((type) & 0x1f) | ((off) << 13) }) - -/* Decode a PTE containing swap "identifier "into constituents */ -#define __swp_type(pte_lookalike) (((pte_lookalike).val) & 0x1f) -#define __swp_offset(pte_lookalike) ((pte_lookalike).val >> 13) - -/* NOPs, to keep generic kernel happy */ -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - -#define kern_addr_valid(addr) (1) - -#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd)) - -/* - * remap a physical page `pfn' of size `size' with page protection `prot' - * into virtual address `from' - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#include <asm/hugepage.h> -#endif /* to cope with aliasing VIPT cache */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index e4031ecd3c8c..f28afcf5c6d1 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -93,7 +93,7 @@ extern unsigned int get_wchan(struct task_struct *p); #define VMALLOC_START (PAGE_OFFSET - (CONFIG_ARC_KVADDR_SIZE << 20)) /* 1 PGDIR_SIZE each for fixmap/pkmap, 2 PGDIR_SIZE gutter (see asm/highmem.h) */ -#define VMALLOC_SIZE ((CONFIG_ARC_KVADDR_SIZE << 20) - PGDIR_SIZE * 4) +#define VMALLOC_SIZE ((CONFIG_ARC_KVADDR_SIZE << 20) - PMD_SIZE * 4) #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) diff --git a/arch/arc/include/asm/setup.h b/arch/arc/include/asm/setup.h index 01f85478170d..028a8cf76206 100644 --- a/arch/arc/include/asm/setup.h +++ b/arch/arc/include/asm/setup.h @@ -2,8 +2,8 @@ /* * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. 
(www.synopsys.com) */ -#ifndef __ASMARC_SETUP_H -#define __ASMARC_SETUP_H +#ifndef __ASM_ARC_SETUP_H +#define __ASM_ARC_SETUP_H #include <linux/types.h> @@ -34,4 +34,12 @@ long __init arc_get_mem_sz(void); #define IS_AVAIL2(v, s, cfg) IS_AVAIL1(v, s), IS_AVAIL1(v, IS_USED_CFG(cfg)) #define IS_AVAIL3(v, v2, s) IS_AVAIL1(v, s), IS_AVAIL1(v, IS_DISABLED_RUN(v2)) +extern void arc_mmu_init(void); +extern char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len); +extern void read_decode_mmu_bcr(void); + +extern void arc_cache_init(void); +extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len); +extern void read_decode_cache_bcr(void); + #endif /* __ASMARC_SETUP_H */ diff --git a/arch/arc/include/asm/smp.h b/arch/arc/include/asm/smp.h index c5de4008d19f..d856491606ac 100644 --- a/arch/arc/include/asm/smp.h +++ b/arch/arc/include/asm/smp.h @@ -105,7 +105,6 @@ static inline const char *arc_platform_smp_cpuinfo(void) #include <asm/spinlock.h> extern arch_spinlock_t smp_atomic_ops_lock; -extern arch_spinlock_t smp_bitops_lock; #define atomic_ops_lock(flags) do { \ local_irq_save(flags); \ @@ -117,24 +116,11 @@ extern arch_spinlock_t smp_bitops_lock; local_irq_restore(flags); \ } while (0) -#define bitops_lock(flags) do { \ - local_irq_save(flags); \ - arch_spin_lock(&smp_bitops_lock); \ -} while (0) - -#define bitops_unlock(flags) do { \ - arch_spin_unlock(&smp_bitops_lock); \ - local_irq_restore(flags); \ -} while (0) - #else /* !CONFIG_SMP */ #define atomic_ops_lock(flags) local_irq_save(flags) #define atomic_ops_unlock(flags) local_irq_restore(flags) -#define bitops_lock(flags) local_irq_save(flags) -#define bitops_unlock(flags) local_irq_restore(flags) - #endif /* !CONFIG_SMP */ #endif /* !CONFIG_ARC_HAS_LLSC */ diff --git a/arch/arc/include/asm/tlb-mmu1.h b/arch/arc/include/asm/tlb-mmu1.h deleted file mode 100644 index a3083b36f5f4..000000000000 --- a/arch/arc/include/asm/tlb-mmu1.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. 
(www.synopsys.com) - */ - -#ifndef __ASM_TLB_MMU_V1_H__ -#define __ASM_TLB_MMU_V1_H__ - -#include <asm/mmu.h> - -#if defined(__ASSEMBLY__) && (CONFIG_ARC_MMU_VER == 1) - -.macro TLB_WRITE_HEURISTICS - -#define JH_HACK1 -#undef JH_HACK2 -#undef JH_HACK3 - -#ifdef JH_HACK3 -; Calculate set index for 2-way MMU -; -avoiding use of GetIndex from MMU -; and its unpleasant LFSR pseudo-random sequence -; -; r1 = TLBPD0 from TLB_RELOAD above -; -; -- jh_ex_way_set not cleared on startup -; didn't want to change setup.c -; hence extra instruction to clean -; -; -- should be in cache since in same line -; as r0/r1 saves above -; -ld r0,[jh_ex_way_sel] ; victim pointer -and r0,r0,1 ; clean -xor.f r0,r0,1 ; flip -st r0,[jh_ex_way_sel] ; store back -asr r0,r1,12 ; get set # <<1, note bit 12=R=0 -or.nz r0,r0,1 ; set way bit -and r0,r0,0xff ; clean -sr r0,[ARC_REG_TLBINDEX] -#endif - -#ifdef JH_HACK2 -; JH hack #2 -; Faster than hack #1 in non-thrash case, but hard-coded for 2-way MMU -; Slower in thrash case (where it matters) because more code is executed -; Inefficient due to two-register paradigm of this miss handler -; -/* r1 = data TLBPD0 at this point */ -lr r0,[eret] /* instruction address */ -xor r0,r0,r1 /* compare set # */ -and.f r0,r0,0x000fe000 /* 2-way MMU mask */ -bne 88f /* not in same set - no need to probe */ - -lr r0,[eret] /* instruction address */ -and r0,r0,PAGE_MASK /* VPN of instruction address */ -; lr r1,[ARC_REG_TLBPD0] /* Data VPN+ASID - already in r1 from TLB_RELOAD*/ -and r1,r1,0xff /* Data ASID */ -or r0,r0,r1 /* Instruction address + Data ASID */ - -lr r1,[ARC_REG_TLBPD0] /* save TLBPD0 containing data TLB*/ -sr r0,[ARC_REG_TLBPD0] /* write instruction address to TLBPD0 */ -sr TLBProbe, [ARC_REG_TLBCOMMAND] /* Look for instruction */ -lr r0,[ARC_REG_TLBINDEX] /* r0 = index where instruction is, if at all */ -sr r1,[ARC_REG_TLBPD0] /* restore TLBPD0 */ - -xor r0,r0,1 /* flip bottom bit of data index */ -b.d 89f -sr r0,[ARC_REG_TLBINDEX] /* and put it back */ -88: -sr TLBGetIndex, [ARC_REG_TLBCOMMAND] -89: -#endif - -#ifdef JH_HACK1 -; -; Always checks whether instruction will be kicked out by dtlb miss -; -mov_s r3, r1 ; save PD0 prepared by TLB_RELOAD in r3 -lr r0,[eret] /* instruction address */ -and r0,r0,PAGE_MASK /* VPN of instruction address */ -bmsk r1,r3,7 /* Data ASID, bits 7-0 */ -or_s r0,r0,r1 /* Instruction address + Data ASID */ - -sr r0,[ARC_REG_TLBPD0] /* write instruction address to TLBPD0 */ -sr TLBProbe, [ARC_REG_TLBCOMMAND] /* Look for instruction */ -lr r0,[ARC_REG_TLBINDEX] /* r0 = index where instruction is, if at all */ -sr r3,[ARC_REG_TLBPD0] /* restore TLBPD0 */ - -sr TLBGetIndex, [ARC_REG_TLBCOMMAND] -lr r1,[ARC_REG_TLBINDEX] /* r1 = index where MMU wants to put data */ -cmp r0,r1 /* if no match on indices, go around */ -xor.eq r1,r1,1 /* flip bottom bit of data index */ -sr r1,[ARC_REG_TLBINDEX] /* and put it back */ -#endif - -.endm - -#endif - -#endif diff --git a/arch/arc/kernel/entry-arcv2.S b/arch/arc/kernel/entry-arcv2.S index 12d5f12d10d2..a7e6a2174187 100644 --- a/arch/arc/kernel/entry-arcv2.S +++ b/arch/arc/kernel/entry-arcv2.S @@ -10,6 +10,7 @@ #include <asm/errno.h> #include <asm/arcregs.h> #include <asm/irqflags.h> +#include <asm/mmu.h> ; A maximum number of supported interrupts in the core interrupt controller. 
; This number is not equal to the maximum interrupt number (256) because diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index 2cb8dfe866b6..dd77a0c8f740 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -101,11 +101,8 @@ ENTRY(EV_MachineCheck) lr r0, [efa] mov r1, sp - ; hardware auto-disables MMU, re-enable it to allow kernel vaddr - ; access for say stack unwinding of modules for crash dumps - lr r3, [ARC_REG_PID] - or r3, r3, MMU_ENABLE - sr r3, [ARC_REG_PID] + ; MC excpetions disable MMU + ARC_MMU_REENABLE r3 lsr r3, r2, 8 bmsk r3, r3, 7 diff --git a/arch/arc/kernel/intc-compact.c b/arch/arc/kernel/intc-compact.c index a86641b91e65..6885e422870e 100644 --- a/arch/arc/kernel/intc-compact.c +++ b/arch/arc/kernel/intc-compact.c @@ -142,7 +142,7 @@ IRQCHIP_DECLARE(arc_intc, "snps,arc700-intc", init_onchip_IRQ); * Time hard-ISR, timer_interrupt( ) calls spin_unlock_irq several times. * Here local_irq_enable( ) shd not re-enable lower priority interrupts * -If called from soft-ISR, it must re-enable all interrupts - * soft ISR are low prioity jobs which can be very slow, thus all IRQs + * soft ISR are low priority jobs which can be very slow, thus all IRQs * must be enabled while they run. * Now hardware context wise we may still be in L2 ISR (not done rtie) * still we must re-enable both L1 and L2 IRQs diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index db0e104d6835..78e6d069b1c1 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -29,10 +29,8 @@ #ifndef CONFIG_ARC_HAS_LLSC arch_spinlock_t smp_atomic_ops_lock = __ARCH_SPIN_LOCK_UNLOCKED; -arch_spinlock_t smp_bitops_lock = __ARCH_SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL_GPL(smp_atomic_ops_lock); -EXPORT_SYMBOL_GPL(smp_bitops_lock); #endif struct plat_smp_ops __weak plat_smp_ops; @@ -283,7 +281,7 @@ static void ipi_send_msg_one(int cpu, enum ipi_msg_type msg) /* * Call the platform specific IPI kick function, but avoid if possible: * Only do so if there's no pending msg from other concurrent sender(s). - * Otherwise, recevier will see this msg as well when it takes the + * Otherwise, receiver will see this msg as well when it takes the * IPI corresponding to that msg. This is true, even if it is already in * IPI handler, because !@old means it has not yet dequeued the msg(s) * so @new msg can be a free-loader diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 1b9576d21e24..c376ff3147e7 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -149,7 +149,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, #else /* On ARC, only Dward based unwinder works. 
fp based backtracing is * not possible (-fno-omit-frame-pointer) because of the way function - * prelogue is setup (callee regs saved and then fp set and not other + * prologue is setup (callee regs saved and then fp set and not other * way around */ pr_warn_once("CONFIG_ARC_DW2_UNWIND needs to be enabled\n"); diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c index 57235e5c0cea..6b83e3f2b41c 100644 --- a/arch/arc/kernel/traps.c +++ b/arch/arc/kernel/traps.c @@ -20,11 +20,6 @@ #include <asm/unaligned.h> #include <asm/kprobes.h> -void __init trap_init(void) -{ - return; -} - void die(const char *str, struct pt_regs *regs, unsigned long address) { show_kernel_fault_diag(str, regs, address); diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index a2fbea3ee07c..8aa1231865d1 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -205,93 +205,24 @@ slc_chk: #define OP_INV_IC 0x4 /* - * I-Cache Aliasing in ARC700 VIPT caches (MMU v1-v3) + * Cache Flush programming model * - * ARC VIPT I-cache uses vaddr to index into cache and paddr to match the tag. - * The orig Cache Management Module "CDU" only required paddr to invalidate a - * certain line since it sufficed as index in Non-Aliasing VIPT cache-geometry. - * Infact for distinct V1,V2,P: all of {V1-P},{V2-P},{P-P} would end up fetching - * the exact same line. + * ARC700 MMUv3 I$ and D$ are both VIPT and can potentially alias. + * Programming model requires both paddr and vaddr irrespecive of aliasing + * considerations: + * - vaddr in {I,D}C_IV?L + * - paddr in {I,D}C_PTAG * - * However for larger Caches (way-size > page-size) - i.e. in Aliasing config, - * paddr alone could not be used to correctly index the cache. + * In HS38x (MMUv4), D$ is PIPT, I$ is VIPT and can still alias. + * Programming model is different for aliasing vs. non-aliasing I$ + * - D$ / Non-aliasing I$: only paddr in {I,D}C_IV?L + * - Aliasing I$: same as ARC700 above (so MMUv3 routine used for MMUv4 I$) * - * ------------------ - * MMU v1/v2 (Fixed Page Size 8k) - * ------------------ - * The solution was to provide CDU with these additonal vaddr bits. These - * would be bits [x:13], x would depend on cache-geometry, 13 comes from - * standard page size of 8k. - * H/w folks chose [17:13] to be a future safe range, and moreso these 5 bits - * of vaddr could easily be "stuffed" in the paddr as bits [4:0] since the - * orig 5 bits of paddr were anyways ignored by CDU line ops, as they - * represent the offset within cache-line. The adv of using this "clumsy" - * interface for additional info was no new reg was needed in CDU programming - * model. - * - * 17:13 represented the max num of bits passable, actual bits needed were - * fewer, based on the num-of-aliases possible. - * -for 2 alias possibility, only bit 13 needed (32K cache) - * -for 4 alias possibility, bits 14:13 needed (64K cache) - * - * ------------------ - * MMU v3 - * ------------------ - * This ver of MMU supports variable page sizes (1k-16k): although Linux will - * only support 8k (default), 16k and 4k. - * However from hardware perspective, smaller page sizes aggravate aliasing - * meaning more vaddr bits needed to disambiguate the cache-line-op ; - * the existing scheme of piggybacking won't work for certain configurations. - * Two new registers IC_PTAG and DC_PTAG inttoduced. 
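(Editorial aside, not part of the patch.) The alias figures in the historical comment above follow from way-size / page-size; a standalone sketch of that arithmetic, with the 2-way geometry assumed from the 32K/2-alias and 64K/4-alias examples:

#include <stdio.h>

int main(void)
{
	unsigned page_sz    = 8 * 1024;                            /* 8K page, as in the comment */
	unsigned ways       = 2;                                   /* assumed 2-way geometry */
	unsigned cache_sz[] = { 16 * 1024, 32 * 1024, 64 * 1024 };

	for (int i = 0; i < 3; i++) {
		unsigned way_sz  = cache_sz[i] / ways;
		unsigned aliases = way_sz > page_sz ? way_sz / page_sz : 1;

		printf("%2uK cache: way size %uK -> %u alias(es)\n",
		       cache_sz[i] / 1024, way_sz / 1024, aliases);
	}
	return 0;
}

log2 of the alias count is how many vaddr bits beyond PAGE_SHIFT the cache ops need to disambiguate lines.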
- * "tag" bits are provided in PTAG, index bits in existing IVIL/IVDL/FLDL regs + * - If PAE40 is enabled, independent of aliasing considerations, the higher + * bits needs to be written into PTAG_HI */ static inline -void __cache_line_loop_v2(phys_addr_t paddr, unsigned long vaddr, - unsigned long sz, const int op, const int full_page) -{ - unsigned int aux_cmd; - int num_lines; - - if (op == OP_INV_IC) { - aux_cmd = ARC_REG_IC_IVIL; - } else { - /* d$ cmd: INV (discard or wback-n-discard) OR FLUSH (wback) */ - aux_cmd = op & OP_INV ? ARC_REG_DC_IVDL : ARC_REG_DC_FLDL; - } - - /* Ensure we properly floor/ceil the non-line aligned/sized requests - * and have @paddr - aligned to cache line and integral @num_lines. - * This however can be avoided for page sized since: - * -@paddr will be cache-line aligned already (being page aligned) - * -@sz will be integral multiple of line size (being page sized). - */ - if (!full_page) { - sz += paddr & ~CACHE_LINE_MASK; - paddr &= CACHE_LINE_MASK; - vaddr &= CACHE_LINE_MASK; - } - - num_lines = DIV_ROUND_UP(sz, L1_CACHE_BYTES); - - /* MMUv2 and before: paddr contains stuffed vaddrs bits */ - paddr |= (vaddr >> PAGE_SHIFT) & 0x1F; - - while (num_lines-- > 0) { - write_aux_reg(aux_cmd, paddr); - paddr += L1_CACHE_BYTES; - } -} - -/* - * For ARC700 MMUv3 I-cache and D-cache flushes - * - ARC700 programming model requires paddr and vaddr be passed in seperate - * AUX registers (*_IV*L and *_PTAG respectively) irrespective of whether the - * caches actually alias or not. - * - For HS38, only the aliasing I-cache configuration uses the PTAG reg - * (non aliasing I-cache version doesn't; while D-cache can't possibly alias) - */ -static inline void __cache_line_loop_v3(phys_addr_t paddr, unsigned long vaddr, unsigned long sz, const int op, const int full_page) { @@ -350,17 +281,6 @@ void __cache_line_loop_v3(phys_addr_t paddr, unsigned long vaddr, #ifndef USE_RGN_FLSH /* - * In HS38x (MMU v4), I-cache is VIPT (can alias), D-cache is PIPT - * Here's how cache ops are implemented - * - * - D-cache: only paddr needed (in DC_IVDL/DC_FLDL) - * - I-cache Non Aliasing: Despite VIPT, only paddr needed (in IC_IVIL) - * - I-cache Aliasing: Both vaddr and paddr needed (in IC_IVIL, IC_PTAG - * respectively, similar to MMU v3 programming model, hence - * __cache_line_loop_v3() is used) - * - * If PAE40 is enabled, independent of aliasing considerations, the higher bits - * needs to be written into PTAG_HI */ static inline void __cache_line_loop_v4(phys_addr_t paddr, unsigned long vaddr, @@ -460,11 +380,9 @@ void __cache_line_loop_v4(phys_addr_t paddr, unsigned long vaddr, #endif -#if (CONFIG_ARC_MMU_VER < 3) -#define __cache_line_loop __cache_line_loop_v2 -#elif (CONFIG_ARC_MMU_VER == 3) +#ifdef CONFIG_ARC_MMU_V3 #define __cache_line_loop __cache_line_loop_v3 -#elif (CONFIG_ARC_MMU_VER > 3) +#else #define __cache_line_loop __cache_line_loop_v4 #endif @@ -1123,7 +1041,7 @@ void clear_user_page(void *to, unsigned long u_vaddr, struct page *page) clear_page(to); clear_bit(PG_dc_clean, &page->flags); } - +EXPORT_SYMBOL(clear_user_page); /********************************************************************** * Explicit Cache flush request from user space via syscall diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index f5657cb68e4f..5787c261c9a4 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -33,28 +33,34 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address) pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; - pgd = 
pgd_offset_fast(current->active_mm, address); + pgd = pgd_offset(current->active_mm, address); pgd_k = pgd_offset_k(address); - if (!pgd_present(*pgd_k)) + if (pgd_none (*pgd_k)) goto bad_area; + if (!pgd_present(*pgd)) + set_pgd(pgd, *pgd_k); p4d = p4d_offset(pgd, address); p4d_k = p4d_offset(pgd_k, address); - if (!p4d_present(*p4d_k)) + if (p4d_none(*p4d_k)) goto bad_area; + if (!p4d_present(*p4d)) + set_p4d(p4d, *p4d_k); pud = pud_offset(p4d, address); pud_k = pud_offset(p4d_k, address); - if (!pud_present(*pud_k)) + if (pud_none(*pud_k)) goto bad_area; + if (!pud_present(*pud)) + set_pud(pud, *pud_k); pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) + if (pmd_none(*pmd_k)) goto bad_area; - - set_pmd(pmd, *pmd_k); + if (!pmd_present(*pmd)) + set_pmd(pmd, *pmd_k); /* XXX: create the TLB entry here */ return 0; diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index c083bf660cec..699ecf119641 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -189,6 +189,11 @@ void __init mem_init(void) { memblock_free_all(); highmem_init(); + + BUILD_BUG_ON((PTRS_PER_PGD * sizeof(pgd_t)) > PAGE_SIZE); + BUILD_BUG_ON((PTRS_PER_PUD * sizeof(pud_t)) > PAGE_SIZE); + BUILD_BUG_ON((PTRS_PER_PMD * sizeof(pmd_t)) > PAGE_SIZE); + BUILD_BUG_ON((PTRS_PER_PTE * sizeof(pte_t)) > PAGE_SIZE); } #ifdef CONFIG_HIGHMEM diff --git a/arch/arc/mm/ioremap.c b/arch/arc/mm/ioremap.c index 95c649fbc95a..0ee75aca6e10 100644 --- a/arch/arc/mm/ioremap.c +++ b/arch/arc/mm/ioremap.c @@ -39,7 +39,8 @@ void __iomem *ioremap(phys_addr_t paddr, unsigned long size) if (arc_uncached_addr_space(paddr)) return (void __iomem *)(u32)paddr; - return ioremap_prot(paddr, size, PAGE_KERNEL_NO_CACHE); + return ioremap_prot(paddr, size, + pgprot_val(pgprot_noncached(PAGE_KERNEL))); } EXPORT_SYMBOL(ioremap); diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index 9c7c68247289..5f71445f26bd 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -1,51 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * TLB Management (flush/create/diagnostics) for ARC700 + * TLB Management (flush/create/diagnostics) for MMUv3 and MMUv4 * * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) * - * vineetg: Aug 2011 - * -Reintroduce duplicate PD fixup - some customer chips still have the issue - * - * vineetg: May 2011 - * -No need to flush_cache_page( ) for each call to update_mmu_cache() - * some of the LMBench tests improved amazingly - * = page-fault thrice as fast (75 usec to 28 usec) - * = mmap twice as fast (9.6 msec to 4.6 msec), - * = fork (5.3 msec to 3.7 msec) - * - * vineetg: April 2011 : - * -MMU v3: PD{0,1} bits layout changed: They don't overlap anymore, - * helps avoid a shift when preparing PD0 from PTE - * - * vineetg: April 2011 : Preparing for MMU V3 - * -MMU v2/v3 BCRs decoded differently - * -Remove TLB_SIZE hardcoding as it's variable now: 256 or 512 - * -tlb_entry_erase( ) can be void - * -local_flush_tlb_range( ): - * = need not "ceil" @end - * = walks MMU only if range spans < 32 entries, as opposed to 256 - * - * Vineetg: Sept 10th 2008 - * -Changes related to MMU v2 (Rel 4.8) - * - * Vineetg: Aug 29th 2008 - * -In TLB Flush operations (Metal Fix MMU) there is a explicit command to - * flush Micro-TLBS. If TLB Index Reg is invalid prior to TLBIVUTLB cmd, - * it fails. 
Thus need to load it with ANY valid value before invoking - * TLBIVUTLB cmd - * - * Vineetg: Aug 21th 2008: - * -Reduced the duration of IRQ lockouts in TLB Flush routines - * -Multiple copies of TLB erase code separated into a "single" function - * -In TLB Flush routines, interrupt disabling moved UP to retrieve ASID - * in interrupt-safe region. - * - * Vineetg: April 23rd Bug #93131 - * Problem: tlb_flush_kernel_range() doesn't do anything if the range to - * flush is more than the size of TLB itself. - * - * Rahul Trivedi : Codito Technologies 2004 */ #include <linux/module.h> @@ -57,47 +15,6 @@ #include <asm/mmu_context.h> #include <asm/mmu.h> -/* Need for ARC MMU v2 - * - * ARC700 MMU-v1 had a Joint-TLB for Code and Data and is 2 way set-assoc. - * For a memcpy operation with 3 players (src/dst/code) such that all 3 pages - * map into same set, there would be contention for the 2 ways causing severe - * Thrashing. - * - * Although J-TLB is 2 way set assoc, ARC700 caches J-TLB into uTLBS which has - * much higher associativity. u-D-TLB is 8 ways, u-I-TLB is 4 ways. - * Given this, the thrashing problem should never happen because once the 3 - * J-TLB entries are created (even though 3rd will knock out one of the prev - * two), the u-D-TLB and u-I-TLB will have what is required to accomplish memcpy - * - * Yet we still see the Thrashing because a J-TLB Write cause flush of u-TLBs. - * This is a simple design for keeping them in sync. So what do we do? - * The solution which James came up was pretty neat. It utilised the assoc - * of uTLBs by not invalidating always but only when absolutely necessary. - * - * - Existing TLB commands work as before - * - New command (TLBWriteNI) for TLB write without clearing uTLBs - * - New command (TLBIVUTLB) to invalidate uTLBs. - * - * The uTLBs need only be invalidated when pages are being removed from the - * OS page table. If a 'victim' TLB entry is being overwritten in the main TLB - * as a result of a miss, the removed entry is still allowed to exist in the - * uTLBs as it is still valid and present in the OS page table. This allows the - * full associativity of the uTLBs to hide the limited associativity of the main - * TLB. - * - * During a miss handler, the new "TLBWriteNI" command is used to load - * entries without clearing the uTLBs. - * - * When the OS page table is updated, TLB entries that may be associated with a - * removed page are removed (flushed) from the TLB using TLBWrite. In this - * circumstance, the uTLBs must also be cleared. This is done by using the - * existing TLBWrite command. An explicit IVUTLB is also required for those - * corner cases when TLBWrite was not executed at all because the corresp - * J-TLB entry got evicted/replaced. - */ - - /* A copy of the ASID from the PID reg is kept in asid_cache */ DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE; @@ -120,32 +37,10 @@ static inline void __tlb_entry_erase(void) static void utlb_invalidate(void) { -#if (CONFIG_ARC_MMU_VER >= 2) - -#if (CONFIG_ARC_MMU_VER == 2) - /* MMU v2 introduced the uTLB Flush command. - * There was however an obscure hardware bug, where uTLB flush would - * fail when a prior probe for J-TLB (both totally unrelated) would - * return lkup err - because the entry didn't exist in MMU. - * The Workaround was to set Index reg with some valid value, prior to - * flush. 
This was fixed in MMU v3 - */ - unsigned int idx; - - /* make sure INDEX Reg is valid */ - idx = read_aux_reg(ARC_REG_TLBINDEX); - - /* If not write some dummy val */ - if (unlikely(idx & TLB_LKUP_ERR)) - write_aux_reg(ARC_REG_TLBINDEX, 0xa); -#endif - write_aux_reg(ARC_REG_TLBCOMMAND, TLBIVUTLB); -#endif - } -#if (CONFIG_ARC_MMU_VER < 4) +#ifdef CONFIG_ARC_MMU_V3 static inline unsigned int tlb_entry_lkup(unsigned long vaddr_n_asid) { @@ -176,7 +71,7 @@ static void tlb_entry_erase(unsigned int vaddr_n_asid) } } -static void tlb_entry_insert(unsigned int pd0, pte_t pd1) +static void tlb_entry_insert(unsigned int pd0, phys_addr_t pd1) { unsigned int idx; @@ -206,7 +101,7 @@ static void tlb_entry_insert(unsigned int pd0, pte_t pd1) write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite); } -#else /* CONFIG_ARC_MMU_VER >= 4) */ +#else /* MMUv4 */ static void tlb_entry_erase(unsigned int vaddr_n_asid) { @@ -214,13 +109,16 @@ static void tlb_entry_erase(unsigned int vaddr_n_asid) write_aux_reg(ARC_REG_TLBCOMMAND, TLBDeleteEntry); } -static void tlb_entry_insert(unsigned int pd0, pte_t pd1) +static void tlb_entry_insert(unsigned int pd0, phys_addr_t pd1) { write_aux_reg(ARC_REG_TLBPD0, pd0); - write_aux_reg(ARC_REG_TLBPD1, pd1); - if (is_pae40_enabled()) + if (!is_pae40_enabled()) { + write_aux_reg(ARC_REG_TLBPD1, pd1); + } else { + write_aux_reg(ARC_REG_TLBPD1, pd1 & 0xFFFFFFFF); write_aux_reg(ARC_REG_TLBPD1HI, (u64)pd1 >> 32); + } write_aux_reg(ARC_REG_TLBCOMMAND, TLBInsertEntry); } @@ -496,7 +394,7 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep) unsigned long flags; unsigned int asid_or_sasid, rwx; unsigned long pd0; - pte_t pd1; + phys_addr_t pd1; /* * create_tlb() assumes that current->mm == vma->mm, since @@ -505,7 +403,6 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep) * * Removing the assumption involves * -Using vma->mm->context{ASID,SASID}, as opposed to MMU reg. - * -Fix the TLB paranoid debug code to not trigger false negatives. 
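(Editorial aside, not part of the patch.) In the tlb_entry_insert() change above, a PAE40 physical address no longer fits the 32-bit TLBPD1 register, so the value is split across TLBPD1 and TLBPD1HI; the split itself is just:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t pd1 = 0x12babe2000ULL | 0x303;	/* hypothetical 40-bit paddr plus PD1 flag bits */

	uint32_t lo = (uint32_t)(pd1 & 0xFFFFFFFF);	/* written to ARC_REG_TLBPD1 */
	uint32_t hi = (uint32_t)(pd1 >> 32);		/* written to ARC_REG_TLBPD1HI */

	printf("TLBPD1 = 0x%08x, TLBPD1HI = 0x%08x\n", lo, hi);
	return 0;
}

When PAE40 is disabled the whole value fits in TLBPD1, which is why the non-PAE path in the patch writes only that register.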
* -More importantly it makes this handler inconsistent with fast-path * TLB Refill handler which always deals with "current" * @@ -528,8 +425,6 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep) local_irq_save(flags); - tlb_paranoid_check(asid_mm(vma->vm_mm, smp_processor_id()), vaddr); - vaddr &= PAGE_MASK; /* update this PTE credentials */ @@ -639,43 +534,6 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, update_mmu_cache(vma, addr, &pte); } -void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - struct list_head *lh = (struct list_head *) pgtable; - - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - if (!pmd_huge_pte(mm, pmdp)) - INIT_LIST_HEAD(lh); - else - list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); - pmd_huge_pte(mm, pmdp) = pgtable; -} - -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - struct list_head *lh; - pgtable_t pgtable; - - assert_spin_locked(&mm->page_table_lock); - - pgtable = pmd_huge_pte(mm, pmdp); - lh = (struct list_head *) pgtable; - if (list_empty(lh)) - pmd_huge_pte(mm, pmdp) = NULL; - else { - pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; - list_del(lh); - } - - pte_val(pgtable[0]) = 0; - pte_val(pgtable[1]) = 0; - - return pgtable; -} - void local_flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { @@ -706,14 +564,6 @@ void read_decode_mmu_bcr(void) { struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu; unsigned int tmp; - struct bcr_mmu_1_2 { -#ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int ver:8, ways:4, sets:4, u_itlb:8, u_dtlb:8; -#else - unsigned int u_dtlb:8, u_itlb:8, sets:4, ways:4, ver:8; -#endif - } *mmu2; - struct bcr_mmu_3 { #ifdef CONFIG_CPU_BIG_ENDIAN unsigned int ver:8, ways:4, sets:4, res:3, sasid:1, pg_sz:4, @@ -738,23 +588,14 @@ void read_decode_mmu_bcr(void) tmp = read_aux_reg(ARC_REG_MMU_BCR); mmu->ver = (tmp >> 24); - if (is_isa_arcompact()) { - if (mmu->ver <= 2) { - mmu2 = (struct bcr_mmu_1_2 *)&tmp; - mmu->pg_sz_k = TO_KB(0x2000); - mmu->sets = 1 << mmu2->sets; - mmu->ways = 1 << mmu2->ways; - mmu->u_dtlb = mmu2->u_dtlb; - mmu->u_itlb = mmu2->u_itlb; - } else { - mmu3 = (struct bcr_mmu_3 *)&tmp; - mmu->pg_sz_k = 1 << (mmu3->pg_sz - 1); - mmu->sets = 1 << mmu3->sets; - mmu->ways = 1 << mmu3->ways; - mmu->u_dtlb = mmu3->u_dtlb; - mmu->u_itlb = mmu3->u_itlb; - mmu->sasid = mmu3->sasid; - } + if (is_isa_arcompact() && mmu->ver == 3) { + mmu3 = (struct bcr_mmu_3 *)&tmp; + mmu->pg_sz_k = 1 << (mmu3->pg_sz - 1); + mmu->sets = 1 << mmu3->sets; + mmu->ways = 1 << mmu3->ways; + mmu->u_dtlb = mmu3->u_dtlb; + mmu->u_itlb = mmu3->u_itlb; + mmu->sasid = mmu3->sasid; } else { mmu4 = (struct bcr_mmu_4 *)&tmp; mmu->pg_sz_k = 1 << (mmu4->sz0 - 1); @@ -780,8 +621,8 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len) IS_USED_CFG(CONFIG_TRANSPARENT_HUGEPAGE)); n += scnprintf(buf + n, len - n, - "MMU [v%x]\t: %dk PAGE, %sJTLB %d (%dx%d), uDTLB %d, uITLB %d%s%s\n", - p_mmu->ver, p_mmu->pg_sz_k, super_pg, + "MMU [v%x]\t: %dk PAGE, %s, swalk %d lvl, JTLB %d (%dx%d), uDTLB %d, uITLB %d%s%s\n", + p_mmu->ver, p_mmu->pg_sz_k, super_pg, CONFIG_PGTABLE_LEVELS, p_mmu->sets * p_mmu->ways, p_mmu->sets, p_mmu->ways, p_mmu->u_dtlb, p_mmu->u_itlb, IS_AVAIL2(p_mmu->pae, ", PAE40 ", CONFIG_ARC_HAS_PAE40)); @@ -815,22 +656,17 @@ void arc_mmu_init(void) /* * Ensure that MMU features assumed by kernel exist in hardware. 
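(Editorial aside, not part of the patch.) The field decode kept in read_decode_mmu_bcr() above turns log2-style BCR codes into real quantities, e.g. pg_sz code 4 means 1 << (4 - 1) = 8K pages; a standalone sketch with made-up field values:

#include <stdio.h>

int main(void)
{
	unsigned pg_sz_code = 4, sets_code = 7, ways_code = 2;	/* example BCR field codes */

	unsigned pg_sz_k = 1u << (pg_sz_code - 1);	/* 8K  */
	unsigned sets    = 1u << sets_code;		/* 128 */
	unsigned ways    = 1u << ways_code;		/* 4   */

	printf("%uk page, JTLB %u (%ux%u)\n", pg_sz_k, sets * ways, sets, ways);
	return 0;
}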
- * For older ARC700 cpus, it has to be exact match, since the MMU - * revisions were not backwards compatible (MMUv3 TLB layout changed - * so even if kernel for v2 didn't use any new cmds of v3, it would - * still not work. - * For HS cpus, MMUv4 was baseline and v5 is backwards compatible - * (will run older software). + * - For older ARC700 cpus, only v3 supported + * - For HS cpus, v4 was baseline and v5 is backwards compatible + * (will run older software). */ - if (is_isa_arcompact() && mmu->ver == CONFIG_ARC_MMU_VER) + if (is_isa_arcompact() && mmu->ver == 3) compat = 1; - else if (is_isa_arcv2() && mmu->ver >= CONFIG_ARC_MMU_VER) + else if (is_isa_arcv2() && mmu->ver >= 4) compat = 1; - if (!compat) { - panic("MMU ver %d doesn't match kernel built for %d...\n", - mmu->ver, CONFIG_ARC_MMU_VER); - } + if (!compat) + panic("MMU ver %d doesn't match kernel built for\n", mmu->ver); if (mmu->pg_sz_k != TO_KB(PAGE_SIZE)) panic("MMU pg size != PAGE_SIZE (%luk)\n", TO_KB(PAGE_SIZE)); @@ -843,14 +679,11 @@ void arc_mmu_init(void) if (IS_ENABLED(CONFIG_ARC_HAS_PAE40) && !mmu->pae) panic("Hardware doesn't support PAE40\n"); - /* Enable the MMU */ - write_aux_reg(ARC_REG_PID, MMU_ENABLE); + /* Enable the MMU with ASID 0 */ + mmu_setup_asid(NULL, 0); - /* In smp we use this reg for interrupt 1 scratch */ -#ifdef ARC_USE_SCRATCH_REG - /* swapper_pg_dir is the pgd for the kernel, used by vmalloc */ - write_aux_reg(ARC_REG_SCRATCH_DATA0, swapper_pg_dir); -#endif + /* cache the pgd pointer in MMU SCRATCH reg (ARCv2 only) */ + mmu_setup_pgd(NULL, swapper_pg_dir); if (pae40_exist_but_not_enab()) write_aux_reg(ARC_REG_TLBPD1HI, 0); @@ -945,40 +778,3 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address, local_irq_restore(flags); } - -/*********************************************************************** - * Diagnostic Routines - * -Called from Low Level TLB Handlers if things don;t look good - **********************************************************************/ - -#ifdef CONFIG_ARC_DBG_TLB_PARANOIA - -/* - * Low Level ASM TLB handler calls this if it finds that HW and SW ASIDS - * don't match - */ -void print_asid_mismatch(int mm_asid, int mmu_asid, int is_fast_path) -{ - pr_emerg("ASID Mismatch in %s Path Handler: sw-pid=0x%x hw-pid=0x%x\n", - is_fast_path ? 
"Fast" : "Slow", mm_asid, mmu_asid); - - __asm__ __volatile__("flag 1"); -} - -void tlb_paranoid_check(unsigned int mm_asid, unsigned long addr) -{ - unsigned int mmu_asid; - - mmu_asid = read_aux_reg(ARC_REG_PID) & 0xff; - - /* - * At the time of a TLB miss/installation - * - HW version needs to match SW version - * - SW needs to have a valid ASID - */ - if (addr < 0x70000000 && - ((mm_asid == MM_CTXT_NO_ASID) || - (mmu_asid != (mm_asid & MM_CTXT_ASID_MASK)))) - print_asid_mismatch(mm_asid, mmu_asid, 0); -} -#endif diff --git a/arch/arc/mm/tlbex.S b/arch/arc/mm/tlbex.S index 062fae46c3f8..e054780a8fe0 100644 --- a/arch/arc/mm/tlbex.S +++ b/arch/arc/mm/tlbex.S @@ -39,7 +39,6 @@ #include <asm/arcregs.h> #include <asm/cache.h> #include <asm/processor.h> -#include <asm/tlb-mmu1.h> #ifdef CONFIG_ISA_ARCOMPACT ;----------------------------------------------------------------- @@ -94,11 +93,6 @@ ex_saved_reg1: st_s r1, [r0, 4] st_s r2, [r0, 8] st_s r3, [r0, 12] - - ; VERIFY if the ASID in MMU-PID Reg is same as - ; one in Linux data structures - - tlb_paranoid_check_asm .endm .macro TLBMISS_RESTORE_REGS @@ -148,53 +142,16 @@ ex_saved_reg1: #endif ;============================================================================ -; Troubleshooting Stuff +;TLB Miss handling Code ;============================================================================ -; Linux keeps ASID (Address Space ID) in task->active_mm->context.asid -; When Creating TLB Entries, instead of doing 3 dependent loads from memory, -; we use the MMU PID Reg to get current ASID. -; In bizzare scenrios SW and HW ASID can get out-of-sync which is trouble. -; So we try to detect this in TLB Mis shandler - -.macro tlb_paranoid_check_asm - -#ifdef CONFIG_ARC_DBG_TLB_PARANOIA - - GET_CURR_TASK_ON_CPU r3 - ld r0, [r3, TASK_ACT_MM] - ld r0, [r0, MM_CTXT+MM_CTXT_ASID] - breq r0, 0, 55f ; Error if no ASID allocated - - lr r1, [ARC_REG_PID] - and r1, r1, 0xFF - - and r2, r0, 0xFF ; MMU PID bits only for comparison - breq r1, r2, 5f - -55: - ; Error if H/w and S/w ASID don't match, but NOT if in kernel mode - lr r2, [erstatus] - bbit0 r2, STATUS_U_BIT, 5f - - ; We sure are in troubled waters, Flag the error, but to do so - ; need to switch to kernel mode stack to call error routine - GET_TSK_STACK_BASE r3, sp - - ; Call printk to shoutout aloud - mov r2, 1 - j print_asid_mismatch - -5: ; ASIDs match so proceed normally - nop - +#ifndef PMD_SHIFT +#define PMD_SHIFT PUD_SHIFT #endif -.endm - -;============================================================================ -;TLB Miss handling Code -;============================================================================ +#ifndef PUD_SHIFT +#define PUD_SHIFT PGDIR_SHIFT +#endif ;----------------------------------------------------------------------------- ; This macro does the page-table lookup for the faulting address. 
@@ -203,7 +160,7 @@ ex_saved_reg1: lr r2, [efa] -#ifdef ARC_USE_SCRATCH_REG +#ifdef CONFIG_ISA_ARCV2 lr r1, [ARC_REG_SCRATCH_DATA0] ; current pgd #else GET_CURR_TASK_ON_CPU r1 @@ -216,6 +173,24 @@ ex_saved_reg1: tst r3, r3 bz do_slow_path_pf ; if no Page Table, do page fault +#if CONFIG_PGTABLE_LEVELS > 3 + lsr r0, r2, PUD_SHIFT ; Bits for indexing into PUD + and r0, r0, (PTRS_PER_PUD - 1) + ld.as r1, [r3, r0] ; PMD entry + tst r1, r1 + bz do_slow_path_pf + mov r3, r1 +#endif + +#if CONFIG_PGTABLE_LEVELS > 2 + lsr r0, r2, PMD_SHIFT ; Bits for indexing into PMD + and r0, r0, (PTRS_PER_PMD - 1) + ld.as r1, [r3, r0] ; PMD entry + tst r1, r1 + bz do_slow_path_pf + mov r3, r1 +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE and.f 0, r3, _PAGE_HW_SZ ; Is this Huge PMD (thp) add2.nz r1, r1, r0 @@ -279,7 +254,7 @@ ex_saved_reg1: ; Commit the TLB entry into MMU .macro COMMIT_ENTRY_TO_MMU -#if (CONFIG_ARC_MMU_VER < 4) +#ifdef CONFIG_ARC_MMU_V3 /* Get free TLB slot: Set = computed from vaddr, way = random */ sr TLBGetIndex, [ARC_REG_TLBCOMMAND] @@ -375,13 +350,6 @@ ENTRY(EV_TLBMissD) CONV_PTE_TO_TLB -#if (CONFIG_ARC_MMU_VER == 1) - ; MMU with 2 way set assoc J-TLB, needs some help in pathetic case of - ; memcpy where 3 parties contend for 2 ways, ensuing a livelock. - ; But only for old MMU or one with Metal Fix - TLB_WRITE_HEURISTICS -#endif - COMMIT_ENTRY_TO_MMU TLBMISS_RESTORE_REGS EV_TLBMissD_fast_ret: ; additional label for VDK OS-kit instrumentation diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index f1d6531b5ce5..fc196421b2ce 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -124,8 +124,8 @@ config ARM select PCI_SYSCALL if PCI select PERF_USE_VMALLOC select RTC_LIB - select SET_FS select SYS_SUPPORTS_APM_EMULATION + select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M # Above selects are sorted alphabetically; please add new ones # according to that. Thanks. 
help @@ -189,10 +189,6 @@ config LOCKDEP_SUPPORT bool default y -config TRACE_IRQFLAGS_SUPPORT - bool - default !CPU_V7M - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 173da685a52e..847c31e7c368 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -308,7 +308,8 @@ $(BOOT_TARGETS): vmlinux @$(kecho) ' Kernel: $(boot)/$@ is ready' $(INSTALL_TARGETS): - $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $@ + $(CONFIG_SHELL) $(srctree)/$(boot)/install.sh "$(KERNELRELEASE)" \ + $(boot)/$(patsubst %install,%Image,$@) System.map "$(INSTALL_PATH)" PHONY += vdso_install vdso_install: diff --git a/arch/arm/boot/Makefile b/arch/arm/boot/Makefile index 0b3cd7a33a26..54a09f9464fb 100644 --- a/arch/arm/boot/Makefile +++ b/arch/arm/boot/Makefile @@ -96,23 +96,11 @@ $(obj)/bootp/bootp: $(obj)/zImage initrd FORCE $(obj)/bootpImage: $(obj)/bootp/bootp FORCE $(call if_changed,objcopy) -PHONY += initrd install zinstall uinstall +PHONY += initrd initrd: @test "$(INITRD_PHYS)" != "" || \ (echo This machine does not support INITRD; exit -1) @test "$(INITRD)" != "" || \ (echo You must specify INITRD; exit -1) -install: - $(CONFIG_SHELL) $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" \ - $(obj)/Image System.map "$(INSTALL_PATH)" - -zinstall: - $(CONFIG_SHELL) $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" \ - $(obj)/zImage System.map "$(INSTALL_PATH)" - -uinstall: - $(CONFIG_SHELL) $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" \ - $(obj)/uImage System.map "$(INSTALL_PATH)" - subdir- := bootp compressed dts diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 9d91ae1091b0..91265e7ff672 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -85,6 +85,8 @@ compress-$(CONFIG_KERNEL_LZ4) = lz4 libfdt_objs := fdt_rw.o fdt_ro.o fdt_wip.o fdt.o ifeq ($(CONFIG_ARM_ATAG_DTB_COMPAT),y) +CFLAGS_REMOVE_atags_to_fdt.o += -Wframe-larger-than=${CONFIG_FRAME_WARN} +CFLAGS_atags_to_fdt.o += -Wframe-larger-than=1280 OBJS += $(libfdt_objs) atags_to_fdt.o endif ifeq ($(CONFIG_USE_OF),y) diff --git a/arch/arm/boot/dts/omap34xx.dtsi b/arch/arm/boot/dts/omap34xx.dtsi index feaa43b78535..8b8451399784 100644 --- a/arch/arm/boot/dts/omap34xx.dtsi +++ b/arch/arm/boot/dts/omap34xx.dtsi @@ -24,7 +24,6 @@ }; }; - /* see Documentation/devicetree/bindings/opp/opp.txt */ cpu0_opp_table: opp-table { compatible = "operating-points-v2-ti-cpu"; syscon = <&scm_conf>; diff --git a/arch/arm/boot/dts/omap36xx.dtsi b/arch/arm/boot/dts/omap36xx.dtsi index 20844dbc002e..22b33098b1a2 100644 --- a/arch/arm/boot/dts/omap36xx.dtsi +++ b/arch/arm/boot/dts/omap36xx.dtsi @@ -29,7 +29,6 @@ }; }; - /* see Documentation/devicetree/bindings/opp/opp.txt */ cpu0_opp_table: opp-table { compatible = "operating-points-v2-ti-cpu"; syscon = <&scm_conf>; diff --git a/arch/arm/configs/dove_defconfig b/arch/arm/configs/dove_defconfig index b935162a8bba..33074fdab2ea 100644 --- a/arch/arm/configs/dove_defconfig +++ b/arch/arm/configs/dove_defconfig @@ -56,7 +56,6 @@ CONFIG_ATA=y CONFIG_SATA_MV=y CONFIG_NETDEVICES=y CONFIG_MV643XX_ETH=y -CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV is not set CONFIG_INPUT_EVDEV=y # CONFIG_KEYBOARD_ATKBD is not set diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index 363f1b1b08e3..58f4834289e6 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -284,7 +284,6 @@ CONFIG_RT2800USB=m CONFIG_MWIFIEX=m CONFIG_MWIFIEX_SDIO=m CONFIG_INPUT_FF_MEMLESS=m -CONFIG_INPUT_POLLDEV=y 
CONFIG_INPUT_MATRIXKMAP=y CONFIG_INPUT_MOUSEDEV=m CONFIG_INPUT_MOUSEDEV_SCREEN_X=640 diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 2e24e765e6d3..5e56288e343b 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -291,6 +291,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { if ((cache_is_vivt() || cache_is_vipt_aliasing())) @@ -312,9 +313,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma, __flush_anon_page(vma, page, vmaddr); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -extern void flush_kernel_dcache_page(struct page *); - #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h index 595e538f5bfb..4b69cf850451 100644 --- a/arch/arm/include/asm/div64.h +++ b/arch/arm/include/asm/div64.h @@ -52,17 +52,6 @@ static inline uint32_t __div64_32(uint64_t *n, uint32_t base) #else -/* - * gcc versions earlier than 4.0 are simply too problematic for the - * __div64_const32() code in asm-generic/div64.h. First there is - * gcc PR 15089 that tend to trig on more complex constructs, spurious - * .global __udivsi3 are inserted even if none of those symbols are - * referenced in the generated code, and those gcc versions are not able - * to do constant propagation on long long values anyway. - */ - -#define __div64_const32_is_OK (__GNUC__ >= 4) - static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias) { unsigned long long res; diff --git a/arch/arm/include/asm/gpio.h b/arch/arm/include/asm/gpio.h index c50e383358c4..f3bb8a2bf788 100644 --- a/arch/arm/include/asm/gpio.h +++ b/arch/arm/include/asm/gpio.h @@ -2,10 +2,6 @@ #ifndef _ARCH_ARM_GPIO_H #define _ARCH_ARM_GPIO_H -#if CONFIG_ARCH_NR_GPIO > 0 -#define ARCH_NR_GPIOS CONFIG_ARCH_NR_GPIO -#endif - /* Note: this may rely upon the value of ARCH_NR_GPIOS set in mach/gpio.h */ #include <asm-generic/gpio.h> diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h index 91d6b7856be4..93051e2f402c 100644 --- a/arch/arm/include/asm/ptrace.h +++ b/arch/arm/include/asm/ptrace.h @@ -19,7 +19,6 @@ struct pt_regs { struct svc_pt_regs { struct pt_regs regs; u32 dacr; - u32 addr_limit; }; #define to_svc_pt_regs(r) container_of(r, struct svc_pt_regs, regs) diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index fd02761ba06c..24c19d63ff0a 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -22,7 +22,21 @@ extern const unsigned long sys_call_table[]; static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { - return task_thread_info(task)->syscall; + if (IS_ENABLED(CONFIG_AEABI) && !IS_ENABLED(CONFIG_OABI_COMPAT)) + return task_thread_info(task)->abi_syscall; + + return task_thread_info(task)->abi_syscall & __NR_SYSCALL_MASK; +} + +static inline bool __in_oabi_syscall(struct task_struct *task) +{ + return IS_ENABLED(CONFIG_OABI_COMPAT) && + (task_thread_info(task)->abi_syscall & __NR_OABI_SYSCALL_BASE); +} + +static inline bool in_oabi_syscall(void) +{ + return __in_oabi_syscall(current); } static inline void syscall_rollback(struct task_struct *task, diff 
--git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index a02799bd0cdf..9a18da3e10cc 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -31,8 +31,6 @@ struct task_struct; #include <asm/types.h> -typedef unsigned long mm_segment_t; - struct cpu_context_save { __u32 r4; __u32 r5; @@ -54,7 +52,6 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ - mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ __u32 cpu; /* cpu */ __u32 cpu_domain; /* cpu domain */ @@ -62,7 +59,7 @@ struct thread_info { unsigned long stack_canary; #endif struct cpu_context_save cpu_context; /* cpu context */ - __u32 syscall; /* syscall number */ + __u32 abi_syscall; /* ABI type and syscall nr */ __u8 used_cp[16]; /* thread used copro */ unsigned long tp_value[2]; /* TLS registers */ union fp_state fpstate __attribute__((aligned(8))); @@ -77,7 +74,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ } /* diff --git a/arch/arm/include/asm/uaccess-asm.h b/arch/arm/include/asm/uaccess-asm.h index e6eb7a2aaf1e..6451a433912c 100644 --- a/arch/arm/include/asm/uaccess-asm.h +++ b/arch/arm/include/asm/uaccess-asm.h @@ -84,12 +84,8 @@ * if \disable is set. */ .macro uaccess_entry, tsk, tmp0, tmp1, tmp2, disable - ldr \tmp1, [\tsk, #TI_ADDR_LIMIT] - ldr \tmp2, =TASK_SIZE - str \tmp2, [\tsk, #TI_ADDR_LIMIT] DACR( mrc p15, 0, \tmp0, c3, c0, 0) DACR( str \tmp0, [sp, #SVC_DACR]) - str \tmp1, [sp, #SVC_ADDR_LIMIT] .if \disable && IS_ENABLED(CONFIG_CPU_SW_DOMAIN_PAN) /* kernel=client, user=no access */ mov \tmp2, #DACR_UACCESS_DISABLE @@ -106,9 +102,7 @@ /* Restore the user access state previously saved by uaccess_entry */ .macro uaccess_exit, tsk, tmp0, tmp1 - ldr \tmp1, [sp, #SVC_ADDR_LIMIT] DACR( ldr \tmp0, [sp, #SVC_DACR]) - str \tmp1, [\tsk, #TI_ADDR_LIMIT] DACR( mcr p15, 0, \tmp0, c3, c0, 0) .endm diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index a13d90206472..084d1c07c2d0 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -52,32 +52,8 @@ static __always_inline void uaccess_restore(unsigned int flags) extern int __get_user_bad(void); extern int __put_user_bad(void); -/* - * Note that this is actually 0x1,0000,0000 - */ -#define KERNEL_DS 0x00000000 - #ifdef CONFIG_MMU -#define USER_DS TASK_SIZE -#define get_fs() (current_thread_info()->addr_limit) - -static inline void set_fs(mm_segment_t fs) -{ - current_thread_info()->addr_limit = fs; - - /* - * Prevent a mispredicted conditional call to set_fs from forwarding - * the wrong address limit to access_ok under speculation. - */ - dsb(nsh); - isb(); - - modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER); -} - -#define uaccess_kernel() (get_fs() == KERNEL_DS) - /* * We use 33-bit arithmetic here. Success returns zero, failure returns * addr_limit. 
We take advantage that addr_limit will be zero for KERNEL_DS, @@ -89,7 +65,7 @@ static inline void set_fs(mm_segment_t fs) __asm__(".syntax unified\n" \ "adds %1, %2, %3; sbcscc %1, %1, %0; movcc %0, #0" \ : "=&r" (flag), "=&r" (roksum) \ - : "r" (addr), "Ir" (size), "0" (current_thread_info()->addr_limit) \ + : "r" (addr), "Ir" (size), "0" (TASK_SIZE) \ : "cc"); \ flag; }) @@ -120,7 +96,7 @@ static inline void __user *__uaccess_mask_range_ptr(const void __user *ptr, " subshs %1, %1, %2\n" " movlo %0, #0\n" : "+r" (safe_ptr), "=&r" (tmp) - : "r" (size), "r" (current_thread_info()->addr_limit) + : "r" (size), "r" (TASK_SIZE) : "cc"); csdb(); @@ -194,7 +170,7 @@ extern int __get_user_64t_4(void *); #define __get_user_check(x, p) \ ({ \ - unsigned long __limit = current_thread_info()->addr_limit - 1; \ + unsigned long __limit = TASK_SIZE - 1; \ register typeof(*(p)) __user *__p asm("r0") = (p); \ register __inttype(x) __r2 asm("r2"); \ register unsigned long __l asm("r1") = __limit; \ @@ -245,7 +221,7 @@ extern int __put_user_8(void *, unsigned long long); #define __put_user_check(__pu_val, __ptr, __err, __s) \ ({ \ - unsigned long __limit = current_thread_info()->addr_limit - 1; \ + unsigned long __limit = TASK_SIZE - 1; \ register typeof(__pu_val) __r2 asm("r2") = __pu_val; \ register const void __user *__p asm("r0") = __ptr; \ register unsigned long __l asm("r1") = __limit; \ @@ -262,19 +238,8 @@ extern int __put_user_8(void *, unsigned long long); #else /* CONFIG_MMU */ -/* - * uClinux has only one addr space, so has simplified address limits. - */ -#define USER_DS KERNEL_DS - -#define uaccess_kernel() (true) #define __addr_ok(addr) ((void)(addr), 1) #define __range_ok(addr, size) ((void)(addr), 0) -#define get_fs() (KERNEL_DS) - -static inline void set_fs(mm_segment_t fs) -{ -} #define get_user(x, p) __get_user(x, p) #define __put_user_check __put_user_nocheck @@ -283,9 +248,6 @@ static inline void set_fs(mm_segment_t fs) #define access_ok(addr, size) (__range_ok(addr, size) == 0) -#define user_addr_max() \ - (uaccess_kernel() ? 
~0UL : get_fs()) - #ifdef CONFIG_CPU_SPECTRE /* * When mitigating Spectre variant 1, it is not worth fixing the non- @@ -308,11 +270,11 @@ static inline void set_fs(mm_segment_t fs) #define __get_user(x, ptr) \ ({ \ long __gu_err = 0; \ - __get_user_err((x), (ptr), __gu_err); \ + __get_user_err((x), (ptr), __gu_err, TUSER()); \ __gu_err; \ }) -#define __get_user_err(x, ptr, err) \ +#define __get_user_err(x, ptr, err, __t) \ do { \ unsigned long __gu_addr = (unsigned long)(ptr); \ unsigned long __gu_val; \ @@ -321,18 +283,19 @@ do { \ might_fault(); \ __ua_flags = uaccess_save_and_enable(); \ switch (sizeof(*(ptr))) { \ - case 1: __get_user_asm_byte(__gu_val, __gu_addr, err); break; \ - case 2: __get_user_asm_half(__gu_val, __gu_addr, err); break; \ - case 4: __get_user_asm_word(__gu_val, __gu_addr, err); break; \ + case 1: __get_user_asm_byte(__gu_val, __gu_addr, err, __t); break; \ + case 2: __get_user_asm_half(__gu_val, __gu_addr, err, __t); break; \ + case 4: __get_user_asm_word(__gu_val, __gu_addr, err, __t); break; \ default: (__gu_val) = __get_user_bad(); \ } \ uaccess_restore(__ua_flags); \ (x) = (__typeof__(*(ptr)))__gu_val; \ } while (0) +#endif #define __get_user_asm(x, addr, err, instr) \ __asm__ __volatile__( \ - "1: " TUSER(instr) " %1, [%2], #0\n" \ + "1: " instr " %1, [%2], #0\n" \ "2:\n" \ " .pushsection .text.fixup,\"ax\"\n" \ " .align 2\n" \ @@ -348,40 +311,38 @@ do { \ : "r" (addr), "i" (-EFAULT) \ : "cc") -#define __get_user_asm_byte(x, addr, err) \ - __get_user_asm(x, addr, err, ldrb) +#define __get_user_asm_byte(x, addr, err, __t) \ + __get_user_asm(x, addr, err, "ldrb" __t) #if __LINUX_ARM_ARCH__ >= 6 -#define __get_user_asm_half(x, addr, err) \ - __get_user_asm(x, addr, err, ldrh) +#define __get_user_asm_half(x, addr, err, __t) \ + __get_user_asm(x, addr, err, "ldrh" __t) #else #ifndef __ARMEB__ -#define __get_user_asm_half(x, __gu_addr, err) \ +#define __get_user_asm_half(x, __gu_addr, err, __t) \ ({ \ unsigned long __b1, __b2; \ - __get_user_asm_byte(__b1, __gu_addr, err); \ - __get_user_asm_byte(__b2, __gu_addr + 1, err); \ + __get_user_asm_byte(__b1, __gu_addr, err, __t); \ + __get_user_asm_byte(__b2, __gu_addr + 1, err, __t); \ (x) = __b1 | (__b2 << 8); \ }) #else -#define __get_user_asm_half(x, __gu_addr, err) \ +#define __get_user_asm_half(x, __gu_addr, err, __t) \ ({ \ unsigned long __b1, __b2; \ - __get_user_asm_byte(__b1, __gu_addr, err); \ - __get_user_asm_byte(__b2, __gu_addr + 1, err); \ + __get_user_asm_byte(__b1, __gu_addr, err, __t); \ + __get_user_asm_byte(__b2, __gu_addr + 1, err, __t); \ (x) = (__b1 << 8) | __b2; \ }) #endif #endif /* __LINUX_ARM_ARCH__ >= 6 */ -#define __get_user_asm_word(x, addr, err) \ - __get_user_asm(x, addr, err, ldr) -#endif - +#define __get_user_asm_word(x, addr, err, __t) \ + __get_user_asm(x, addr, err, "ldr" __t) #define __put_user_switch(x, ptr, __err, __fn) \ do { \ @@ -425,7 +386,7 @@ do { \ #define __put_user_nocheck(x, __pu_ptr, __err, __size) \ do { \ unsigned long __pu_addr = (unsigned long)__pu_ptr; \ - __put_user_nocheck_##__size(x, __pu_addr, __err); \ + __put_user_nocheck_##__size(x, __pu_addr, __err, TUSER());\ } while (0) #define __put_user_nocheck_1 __put_user_asm_byte @@ -433,9 +394,11 @@ do { \ #define __put_user_nocheck_4 __put_user_asm_word #define __put_user_nocheck_8 __put_user_asm_dword +#endif /* !CONFIG_CPU_SPECTRE */ + #define __put_user_asm(x, __pu_addr, err, instr) \ __asm__ __volatile__( \ - "1: " TUSER(instr) " %1, [%2], #0\n" \ + "1: " instr " %1, [%2], #0\n" \ "2:\n" \ " .pushsection 
.text.fixup,\"ax\"\n" \ " .align 2\n" \ @@ -450,36 +413,36 @@ do { \ : "r" (x), "r" (__pu_addr), "i" (-EFAULT) \ : "cc") -#define __put_user_asm_byte(x, __pu_addr, err) \ - __put_user_asm(x, __pu_addr, err, strb) +#define __put_user_asm_byte(x, __pu_addr, err, __t) \ + __put_user_asm(x, __pu_addr, err, "strb" __t) #if __LINUX_ARM_ARCH__ >= 6 -#define __put_user_asm_half(x, __pu_addr, err) \ - __put_user_asm(x, __pu_addr, err, strh) +#define __put_user_asm_half(x, __pu_addr, err, __t) \ + __put_user_asm(x, __pu_addr, err, "strh" __t) #else #ifndef __ARMEB__ -#define __put_user_asm_half(x, __pu_addr, err) \ +#define __put_user_asm_half(x, __pu_addr, err, __t) \ ({ \ unsigned long __temp = (__force unsigned long)(x); \ - __put_user_asm_byte(__temp, __pu_addr, err); \ - __put_user_asm_byte(__temp >> 8, __pu_addr + 1, err); \ + __put_user_asm_byte(__temp, __pu_addr, err, __t); \ + __put_user_asm_byte(__temp >> 8, __pu_addr + 1, err, __t);\ }) #else -#define __put_user_asm_half(x, __pu_addr, err) \ +#define __put_user_asm_half(x, __pu_addr, err, __t) \ ({ \ unsigned long __temp = (__force unsigned long)(x); \ - __put_user_asm_byte(__temp >> 8, __pu_addr, err); \ - __put_user_asm_byte(__temp, __pu_addr + 1, err); \ + __put_user_asm_byte(__temp >> 8, __pu_addr, err, __t); \ + __put_user_asm_byte(__temp, __pu_addr + 1, err, __t); \ }) #endif #endif /* __LINUX_ARM_ARCH__ >= 6 */ -#define __put_user_asm_word(x, __pu_addr, err) \ - __put_user_asm(x, __pu_addr, err, str) +#define __put_user_asm_word(x, __pu_addr, err, __t) \ + __put_user_asm(x, __pu_addr, err, "str" __t) #ifndef __ARMEB__ #define __reg_oper0 "%R2" @@ -489,12 +452,12 @@ do { \ #define __reg_oper1 "%R2" #endif -#define __put_user_asm_dword(x, __pu_addr, err) \ +#define __put_user_asm_dword(x, __pu_addr, err, __t) \ __asm__ __volatile__( \ - ARM( "1: " TUSER(str) " " __reg_oper1 ", [%1], #4\n" ) \ - ARM( "2: " TUSER(str) " " __reg_oper0 ", [%1]\n" ) \ - THUMB( "1: " TUSER(str) " " __reg_oper1 ", [%1]\n" ) \ - THUMB( "2: " TUSER(str) " " __reg_oper0 ", [%1, #4]\n" ) \ + ARM( "1: str" __t " " __reg_oper1 ", [%1], #4\n" ) \ + ARM( "2: str" __t " " __reg_oper0 ", [%1]\n" ) \ + THUMB( "1: str" __t " " __reg_oper1 ", [%1]\n" ) \ + THUMB( "2: str" __t " " __reg_oper0 ", [%1, #4]\n" ) \ "3:\n" \ " .pushsection .text.fixup,\"ax\"\n" \ " .align 2\n" \ @@ -510,7 +473,49 @@ do { \ : "r" (x), "i" (-EFAULT) \ : "cc") -#endif /* !CONFIG_CPU_SPECTRE */ +#define HAVE_GET_KERNEL_NOFAULT + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + const type *__pk_ptr = (src); \ + unsigned long __src = (unsigned long)(__pk_ptr); \ + type __val; \ + int __err = 0; \ + switch (sizeof(type)) { \ + case 1: __get_user_asm_byte(__val, __src, __err, ""); break; \ + case 2: __get_user_asm_half(__val, __src, __err, ""); break; \ + case 4: __get_user_asm_word(__val, __src, __err, ""); break; \ + case 8: { \ + u32 *__v32 = (u32*)&__val; \ + __get_user_asm_word(__v32[0], __src, __err, ""); \ + if (__err) \ + break; \ + __get_user_asm_word(__v32[1], __src+4, __err, ""); \ + break; \ + } \ + default: __err = __get_user_bad(); break; \ + } \ + *(type *)(dst) = __val; \ + if (__err) \ + goto err_label; \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + const type *__pk_ptr = (dst); \ + unsigned long __dst = (unsigned long)__pk_ptr; \ + int __err = 0; \ + type __val = *(type *)src; \ + switch (sizeof(type)) { \ + case 1: __put_user_asm_byte(__val, __dst, __err, ""); break; \ + case 2: __put_user_asm_half(__val, __dst, __err, ""); 
break; \ + case 4: __put_user_asm_word(__val, __dst, __err, ""); break; \ + case 8: __put_user_asm_dword(__val, __dst, __err, ""); break; \ + default: __err = __put_user_bad(); break; \ + } \ + if (__err) \ + goto err_label; \ +} while (0) #ifdef CONFIG_MMU extern unsigned long __must_check diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h index 1e2c3eb04353..ce9689118dbb 100644 --- a/arch/arm/include/asm/unified.h +++ b/arch/arm/include/asm/unified.h @@ -24,10 +24,6 @@ __asm__(".syntax unified"); #ifdef CONFIG_THUMB2_KERNEL -#if __GNUC__ < 4 -#error Thumb-2 kernel requires gcc >= 4 -#endif - /* The CPSR bit describing the instruction set (Thumb) */ #define PSR_ISETSTATE PSR_T_BIT diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index ae7749e15726..a1149911464c 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -15,6 +15,7 @@ #define _UAPI__ASM_ARM_UNISTD_H #define __NR_OABI_SYSCALL_BASE 0x900000 +#define __NR_SYSCALL_MASK 0x0fffff #if defined(__thumb__) || defined(__ARM_EABI__) #define __NR_SYSCALL_BASE 0 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 64944701bf6a..a646a3f6440f 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -43,11 +43,11 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context)); + DEFINE(TI_ABI_SYSCALL, offsetof(struct thread_info, abi_syscall)); DEFINE(TI_USED_CP, offsetof(struct thread_info, used_cp)); DEFINE(TI_TP_VALUE, offsetof(struct thread_info, tp_value)); DEFINE(TI_FPSTATE, offsetof(struct thread_info, fpstate)); @@ -88,7 +88,6 @@ int main(void) DEFINE(S_OLD_R0, offsetof(struct pt_regs, ARM_ORIG_r0)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); DEFINE(SVC_DACR, offsetof(struct svc_pt_regs, dacr)); - DEFINE(SVC_ADDR_LIMIT, offsetof(struct svc_pt_regs, addr_limit)); DEFINE(SVC_REGS_SIZE, sizeof(struct svc_pt_regs)); BLANK(); DEFINE(SIGFRAME_RC3_OFFSET, offsetof(struct sigframe, retcode[3])); diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 7f0b7aba1498..d9c99db50243 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -49,10 +49,6 @@ __ret_fast_syscall: UNWIND(.fnstart ) UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts - ldr r2, [tsk, #TI_ADDR_LIMIT] - ldr r1, =TASK_SIZE - cmp r2, r1 - blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing movs r1, r1, lsl #16 bne fast_work_pending @@ -87,10 +83,6 @@ __ret_fast_syscall: bl do_rseq_syscall #endif disable_irq_notrace @ disable interrupts - ldr r2, [tsk, #TI_ADDR_LIMIT] - ldr r1, =TASK_SIZE - cmp r2, r1 - blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing movs r1, r1, lsl #16 beq no_work_pending @@ -129,10 +121,6 @@ ret_slow_syscall: #endif disable_irq_notrace @ disable interrupts ENTRY(ret_to_user_from_irq) - ldr r2, [tsk, #TI_ADDR_LIMIT] - ldr r1, =TASK_SIZE - cmp r2, r1 - blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] movs r1, r1, lsl #16 bne slow_work_pending @@ -226,6 +214,7 @@ ENTRY(vector_swi) /* saved_psr and 
saved_pc are now dead */ uaccess_disable tbl + get_thread_info tsk adr tbl, sys_call_table @ load syscall table pointer @@ -237,13 +226,17 @@ ENTRY(vector_swi) * get the old ABI syscall table address. */ bics r10, r10, #0xff000000 + strne r10, [tsk, #TI_ABI_SYSCALL] + streq scno, [tsk, #TI_ABI_SYSCALL] eorne scno, r10, #__NR_OABI_SYSCALL_BASE ldrne tbl, =sys_oabi_call_table #elif !defined(CONFIG_AEABI) bic scno, scno, #0xff000000 @ mask off SWI op-code + str scno, [tsk, #TI_ABI_SYSCALL] eor scno, scno, #__NR_SYSCALL_BASE @ check OS number +#else + str scno, [tsk, #TI_ABI_SYSCALL] #endif - get_thread_info tsk /* * Reload the registers that may have been corrupted on entry to * the syscall assembly (by tracing or context tracking.) @@ -288,7 +281,6 @@ ENDPROC(vector_swi) * context switches, and waiting for our parent to respond. */ __sys_trace: - mov r1, scno add r0, sp, #S_OFF bl syscall_trace_enter mov scno, r0 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index fc9e8b37eaa8..0e2d3051741e 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -5,8 +5,6 @@ * Copyright (C) 1996-2000 Russell King - Converted to ARM. * Original Copyright (C) 1995 Linus Torvalds */ -#include <stdarg.h> - #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> @@ -108,7 +106,7 @@ void __show_regs(struct pt_regs *regs) unsigned long flags; char buf[64]; #ifndef CONFIG_CPU_V7M - unsigned int domain, fs; + unsigned int domain; #ifdef CONFIG_CPU_SW_DOMAIN_PAN /* * Get the domain register for the parent context. In user @@ -117,14 +115,11 @@ void __show_regs(struct pt_regs *regs) */ if (user_mode(regs)) { domain = DACR_UACCESS_ENABLE; - fs = get_fs(); } else { domain = to_svc_pt_regs(regs)->dacr; - fs = to_svc_pt_regs(regs)->addr_limit; } #else domain = get_domain(); - fs = get_fs(); #endif #endif @@ -160,8 +155,6 @@ void __show_regs(struct pt_regs *regs) if ((domain & domain_mask(DOMAIN_USER)) == domain_val(DOMAIN_USER, DOMAIN_NOACCESS)) segment = "none"; - else if (fs == KERNEL_DS) - segment = "kernel"; else segment = "user"; diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index b008859680bc..43b963ea4a0e 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -25,6 +25,7 @@ #include <linux/tracehook.h> #include <linux/unistd.h> +#include <asm/syscall.h> #include <asm/traps.h> #define CREATE_TRACE_POINTS @@ -785,7 +786,8 @@ long arch_ptrace(struct task_struct *child, long request, break; case PTRACE_SET_SYSCALL: - task_thread_info(child)->syscall = data; + task_thread_info(child)->abi_syscall = data & + __NR_SYSCALL_MASK; ret = 0; break; @@ -844,14 +846,14 @@ static void tracehook_report_syscall(struct pt_regs *regs, if (dir == PTRACE_SYSCALL_EXIT) tracehook_report_syscall_exit(regs, 0); else if (tracehook_report_syscall_entry(regs)) - current_thread_info()->syscall = -1; + current_thread_info()->abi_syscall = -1; regs->ARM_ip = ip; } -asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno) +asmlinkage int syscall_trace_enter(struct pt_regs *regs) { - current_thread_info()->syscall = scno; + int scno; if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); @@ -862,11 +864,11 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno) return -1; #else /* XXX: remove this once OABI gets fixed */ - secure_computing_strict(current_thread_info()->syscall); + secure_computing_strict(syscall_get_nr(current, regs)); #endif /* Tracer or seccomp may have changed 
syscall. */ - scno = current_thread_info()->syscall; + scno = syscall_get_nr(current, regs); if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, scno); diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index f97eb2371672..284a80c0b6e1 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1012,31 +1012,25 @@ static void __init reserve_crashkernel(void) unsigned long long lowmem_max = __pa(high_memory - 1) + 1; if (crash_max > lowmem_max) crash_max = lowmem_max; - crash_base = memblock_find_in_range(CRASH_ALIGN, crash_max, - crash_size, CRASH_ALIGN); + + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + CRASH_ALIGN, crash_max); if (!crash_base) { pr_err("crashkernel reservation failed - No suitable area found.\n"); return; } } else { + unsigned long long crash_max = crash_base + crash_size; unsigned long long start; - start = memblock_find_in_range(crash_base, - crash_base + crash_size, - crash_size, SECTION_SIZE); - if (start != crash_base) { + start = memblock_phys_alloc_range(crash_size, SECTION_SIZE, + crash_base, crash_max); + if (!start) { pr_err("crashkernel reservation failed - memory is in use.\n"); return; } } - ret = memblock_reserve(crash_base, crash_size); - if (ret < 0) { - pr_warn("crashkernel reservation failed - memory is in use (0x%lx)\n", - (unsigned long)crash_base); - return; - } - pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", (unsigned long)(crash_size >> 20), (unsigned long)(crash_base >> 20), diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 4e0dcff3f5b0..a41e27ace391 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -628,7 +628,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) uprobe_notify_resume(regs); } else { tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); } } local_irq_disable(); @@ -669,14 +668,6 @@ struct page *get_signal_page(void) return page; } -/* Defer to generic check */ -asmlinkage void addr_limit_check_failed(void) -{ -#ifdef CONFIG_MMU - addr_limit_user_check(); -#endif -} - #ifdef CONFIG_DEBUG_RSEQ asmlinkage void do_rseq_syscall(struct pt_regs *regs) { diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 075a2e0ed2c1..68112c172025 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -80,9 +80,12 @@ #include <linux/socket.h> #include <linux/net.h> #include <linux/ipc.h> +#include <linux/ipc_namespace.h> #include <linux/uaccess.h> #include <linux/slab.h> +#include <asm/syscall.h> + struct oldabi_stat64 { unsigned long long st_dev; unsigned int __pad1; @@ -191,60 +194,87 @@ struct oabi_flock64 { pid_t l_pid; } __attribute__ ((packed,aligned(4))); -static long do_locks(unsigned int fd, unsigned int cmd, - unsigned long arg) +static int get_oabi_flock(struct flock64 *kernel, struct oabi_flock64 __user *arg) { - struct flock64 kernel; struct oabi_flock64 user; - mm_segment_t fs; - long ret; if (copy_from_user(&user, (struct oabi_flock64 __user *)arg, sizeof(user))) return -EFAULT; - kernel.l_type = user.l_type; - kernel.l_whence = user.l_whence; - kernel.l_start = user.l_start; - kernel.l_len = user.l_len; - kernel.l_pid = user.l_pid; - - fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_fcntl64(fd, cmd, (unsigned long)&kernel); - set_fs(fs); - - if (!ret && (cmd == F_GETLK64 || cmd == F_OFD_GETLK)) { - user.l_type = kernel.l_type; - user.l_whence = kernel.l_whence; - user.l_start = 
kernel.l_start; - user.l_len = kernel.l_len; - user.l_pid = kernel.l_pid; - if (copy_to_user((struct oabi_flock64 __user *)arg, - &user, sizeof(user))) - ret = -EFAULT; - } - return ret; + + kernel->l_type = user.l_type; + kernel->l_whence = user.l_whence; + kernel->l_start = user.l_start; + kernel->l_len = user.l_len; + kernel->l_pid = user.l_pid; + + return 0; +} + +static int put_oabi_flock(struct flock64 *kernel, struct oabi_flock64 __user *arg) +{ + struct oabi_flock64 user; + + user.l_type = kernel->l_type; + user.l_whence = kernel->l_whence; + user.l_start = kernel->l_start; + user.l_len = kernel->l_len; + user.l_pid = kernel->l_pid; + + if (copy_to_user((struct oabi_flock64 __user *)arg, + &user, sizeof(user))) + return -EFAULT; + + return 0; } asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg) { + void __user *argp = (void __user *)arg; + struct fd f = fdget_raw(fd); + struct flock64 flock; + long err = -EBADF; + + if (!f.file) + goto out; + switch (cmd) { - case F_OFD_GETLK: - case F_OFD_SETLK: - case F_OFD_SETLKW: case F_GETLK64: + case F_OFD_GETLK: + err = security_file_fcntl(f.file, cmd, arg); + if (err) + break; + err = get_oabi_flock(&flock, argp); + if (err) + break; + err = fcntl_getlk64(f.file, cmd, &flock); + if (!err) + err = put_oabi_flock(&flock, argp); + break; case F_SETLK64: case F_SETLKW64: - return do_locks(fd, cmd, arg); - + case F_OFD_SETLK: + case F_OFD_SETLKW: + err = security_file_fcntl(f.file, cmd, arg); + if (err) + break; + err = get_oabi_flock(&flock, argp); + if (err) + break; + err = fcntl_setlk64(fd, f.file, cmd, &flock); + break; default: - return sys_fcntl64(fd, cmd, arg); + err = sys_fcntl64(fd, cmd, arg); + break; } + fdput(f); +out: + return err; } struct oabi_epoll_event { - __u32 events; + __poll_t events; __u64 data; } __attribute__ ((packed,aligned(4))); @@ -264,55 +294,34 @@ asmlinkage long sys_oabi_epoll_ctl(int epfd, int op, int fd, return do_epoll_ctl(epfd, op, fd, &kernel, false); } - -asmlinkage long sys_oabi_epoll_wait(int epfd, - struct oabi_epoll_event __user *events, - int maxevents, int timeout) -{ - struct epoll_event *kbuf; - struct oabi_epoll_event e; - mm_segment_t fs; - long ret, err, i; - - if (maxevents <= 0 || - maxevents > (INT_MAX/sizeof(*kbuf)) || - maxevents > (INT_MAX/sizeof(*events))) - return -EINVAL; - if (!access_ok(events, sizeof(*events) * maxevents)) - return -EFAULT; - kbuf = kmalloc_array(maxevents, sizeof(*kbuf), GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_epoll_wait(epfd, kbuf, maxevents, timeout); - set_fs(fs); - err = 0; - for (i = 0; i < ret; i++) { - e.events = kbuf[i].events; - e.data = kbuf[i].data; - err = __copy_to_user(events, &e, sizeof(e)); - if (err) - break; - events++; - } - kfree(kbuf); - return err ? 
-EFAULT : ret; -} #else asmlinkage long sys_oabi_epoll_ctl(int epfd, int op, int fd, struct oabi_epoll_event __user *event) { return -EINVAL; } +#endif -asmlinkage long sys_oabi_epoll_wait(int epfd, - struct oabi_epoll_event __user *events, - int maxevents, int timeout) +struct epoll_event __user * +epoll_put_uevent(__poll_t revents, __u64 data, + struct epoll_event __user *uevent) { - return -EINVAL; + if (in_oabi_syscall()) { + struct oabi_epoll_event __user *oevent = (void __user *)uevent; + + if (__put_user(revents, &oevent->events) || + __put_user(data, &oevent->data)) + return NULL; + + return (void __user *)(oevent+1); + } + + if (__put_user(revents, &uevent->events) || + __put_user(data, &uevent->data)) + return NULL; + + return uevent+1; } -#endif struct oabi_sembuf { unsigned short sem_num; @@ -321,46 +330,52 @@ struct oabi_sembuf { unsigned short __pad; }; +#define sc_semopm sem_ctls[2] + +#ifdef CONFIG_SYSVIPC asmlinkage long sys_oabi_semtimedop(int semid, struct oabi_sembuf __user *tsops, unsigned nsops, const struct old_timespec32 __user *timeout) { + struct ipc_namespace *ns; struct sembuf *sops; - struct old_timespec32 local_timeout; long err; int i; + ns = current->nsproxy->ipc_ns; + if (nsops > ns->sc_semopm) + return -E2BIG; if (nsops < 1 || nsops > SEMOPM) return -EINVAL; - if (!access_ok(tsops, sizeof(*tsops) * nsops)) - return -EFAULT; - sops = kmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); + sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); if (!sops) return -ENOMEM; err = 0; for (i = 0; i < nsops; i++) { struct oabi_sembuf osb; - err |= __copy_from_user(&osb, tsops, sizeof(osb)); + err |= copy_from_user(&osb, tsops, sizeof(osb)); sops[i].sem_num = osb.sem_num; sops[i].sem_op = osb.sem_op; sops[i].sem_flg = osb.sem_flg; tsops++; } - if (timeout) { - /* copy this as well before changing domain protection */ - err |= copy_from_user(&local_timeout, timeout, sizeof(*timeout)); - timeout = &local_timeout; - } if (err) { err = -EFAULT; - } else { - mm_segment_t fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_semtimedop_time32(semid, sops, nsops, timeout); - set_fs(fs); + goto out; } - kfree(sops); + + if (timeout) { + struct timespec64 ts; + err = get_old_timespec32(&ts, timeout); + if (err) + goto out; + err = __do_semtimedop(semid, sops, nsops, &ts, ns); + goto out; + } + err = __do_semtimedop(semid, sops, nsops, NULL, ns); +out: + kvfree(sops); return err; } @@ -387,6 +402,27 @@ asmlinkage int sys_oabi_ipc(uint call, int first, int second, int third, return sys_ipc(call, first, second, third, ptr, fifth); } } +#else +asmlinkage long sys_oabi_semtimedop(int semid, + struct oabi_sembuf __user *tsops, + unsigned nsops, + const struct old_timespec32 __user *timeout) +{ + return -ENOSYS; +} + +asmlinkage long sys_oabi_semop(int semid, struct oabi_sembuf __user *tsops, + unsigned nsops) +{ + return -ENOSYS; +} + +asmlinkage int sys_oabi_ipc(uint call, int first, int second, int third, + void __user *ptr, long fifth) +{ + return -ENOSYS; +} +#endif asmlinkage long sys_oabi_bind(int fd, struct sockaddr __user *addr, int addrlen) { diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 64308e3a5d0c..4a7edc6e848f 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -122,17 +122,8 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, unsigned long top) { unsigned long first; - mm_segment_t fs; int i; - /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. 
Note that we now dump the - * code first, just in case the backtrace kills us. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - printk("%s%s(0x%08lx to 0x%08lx)\n", lvl, str, bottom, top); for (first = bottom & ~31; first < top; first += 32) { @@ -145,7 +136,7 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, for (p = first, i = 0; i < 8 && p < top; i++, p += 4) { if (p >= bottom && p < top) { unsigned long val; - if (__get_user(val, (unsigned long *)p) == 0) + if (get_kernel_nofault(val, (unsigned long *)p)) sprintf(str + i * 9, " %08lx", val); else sprintf(str + i * 9, " ????????"); @@ -153,11 +144,9 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, } printk("%s%04lx:%s\n", lvl, first & 0xffff, str); } - - set_fs(fs); } -static void __dump_instr(const char *lvl, struct pt_regs *regs) +static void dump_instr(const char *lvl, struct pt_regs *regs) { unsigned long addr = instruction_pointer(regs); const int thumb = thumb_mode(regs); @@ -173,10 +162,20 @@ static void __dump_instr(const char *lvl, struct pt_regs *regs) for (i = -4; i < 1 + !!thumb; i++) { unsigned int val, bad; - if (thumb) - bad = get_user(val, &((u16 *)addr)[i]); - else - bad = get_user(val, &((u32 *)addr)[i]); + if (!user_mode(regs)) { + if (thumb) { + u16 val16; + bad = get_kernel_nofault(val16, &((u16 *)addr)[i]); + val = val16; + } else { + bad = get_kernel_nofault(val, &((u32 *)addr)[i]); + } + } else { + if (thumb) + bad = get_user(val, &((u16 *)addr)[i]); + else + bad = get_user(val, &((u32 *)addr)[i]); + } if (!bad) p += sprintf(p, i == 0 ? "(%0*x) " : "%0*x ", @@ -189,20 +188,6 @@ static void __dump_instr(const char *lvl, struct pt_regs *regs) printk("%sCode: %s\n", lvl, str); } -static void dump_instr(const char *lvl, struct pt_regs *regs) -{ - mm_segment_t fs; - - if (!user_mode(regs)) { - fs = get_fs(); - set_fs(KERNEL_DS); - __dump_instr(lvl, regs); - set_fs(fs); - } else { - __dump_instr(lvl, regs); - } -} - #ifdef CONFIG_ARM_UNWIND static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl) @@ -781,11 +766,6 @@ void abort(void) panic("Oops failed to kill thread"); } -void __init trap_init(void) -{ - return; -} - #ifdef CONFIG_KUSER_HELPERS static void __init kuser_init(void *vectors) { diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S index f8016e3db65d..480a20766137 100644 --- a/arch/arm/lib/copy_from_user.S +++ b/arch/arm/lib/copy_from_user.S @@ -109,8 +109,7 @@ ENTRY(arm_copy_from_user) #ifdef CONFIG_CPU_SPECTRE - get_thread_info r3 - ldr r3, [r3, #TI_ADDR_LIMIT] + ldr r3, =TASK_SIZE uaccess_mask_range_ptr r1, r2, r3, ip #endif diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S index ebfe4cb3d912..842ea5ede485 100644 --- a/arch/arm/lib/copy_to_user.S +++ b/arch/arm/lib/copy_to_user.S @@ -109,8 +109,7 @@ ENTRY(__copy_to_user_std) WEAK(arm_copy_to_user) #ifdef CONFIG_CPU_SPECTRE - get_thread_info r3 - ldr r3, [r3, #TI_ADDR_LIMIT] + ldr r3, =TASK_SIZE uaccess_mask_range_ptr r0, r2, r3, ip #endif diff --git a/arch/arm/mach-bcm/bcm_kona_smc.c b/arch/arm/mach-bcm/bcm_kona_smc.c index 43a16f922b53..43829e49ad93 100644 --- a/arch/arm/mach-bcm/bcm_kona_smc.c +++ b/arch/arm/mach-bcm/bcm_kona_smc.c @@ -10,8 +10,6 @@ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ - -#include <stdarg.h> #include <linux/smp.h> #include <linux/io.h> #include <linux/ioport.h> diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 6d89db7895d1..7ff9feea13a6 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -346,39 +346,6 @@ void flush_dcache_page(struct page *page) EXPORT_SYMBOL(flush_dcache_page); /* - * Ensure cache coherency for the kernel mapping of this page. We can - * assume that the page is pinned via kmap. - * - * If the page only exists in the page cache and there are no user - * space mappings, this is a no-op since the page was already marked - * dirty at creation. Otherwise, we need to flush the dirty kernel - * cache lines directly. - */ -void flush_kernel_dcache_page(struct page *page) -{ - if (cache_is_vivt() || cache_is_vipt_aliasing()) { - struct address_space *mapping; - - mapping = page_mapping_file(page); - - if (!mapping || mapping_mapped(mapping)) { - void *addr; - - addr = page_address(page); - /* - * kmap_atomic() doesn't set the page virtual - * address for highmem pages, and - * kunmap_atomic() takes care of cache - * flushing already. - */ - if (!IS_ENABLED(CONFIG_HIGHMEM) || addr) - __cpuc_flush_dcache_area(addr, PAGE_SIZE); - } - } -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - -/* * Flush an anonymous page so that users of get_user_pages() * can safely access the data. The expected sequence is: * diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 8b3d7191e2b8..2658f52903da 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -166,12 +166,6 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -void flush_kernel_dcache_page(struct page *page) -{ - __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *dst, const void *src, unsigned long len) diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index f8a2d5aa17b7..e842209e135d 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -266,7 +266,7 @@ 249 common lookup_dcookie sys_lookup_dcookie 250 common epoll_create sys_epoll_create 251 common epoll_ctl sys_epoll_ctl sys_oabi_epoll_ctl -252 common epoll_wait sys_epoll_wait sys_oabi_epoll_wait +252 common epoll_wait sys_epoll_wait 253 common remap_file_pages sys_remap_file_pages # 254 for set_thread_area # 255 for get_thread_area @@ -460,3 +460,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9dc1720a909f..5c7ae4c3954b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -86,7 +86,7 @@ config ARM64 select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_SUPPORTS_CFI_CLANG select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG) + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT @@ -220,6 +220,7 @@ config ARM64 select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD + select TRACE_IRQFLAGS_SUPPORT help ARM 64-bit (AArch64) Linux support. 
@@ -287,9 +288,6 @@ config ILLEGAL_POINTER_VALUE config LOCKDEP_SUPPORT def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/arm64/boot/dts/qcom/ipq8074.dtsi b/arch/arm64/boot/dts/qcom/ipq8074.dtsi index a620ac0d0b19..db333001df4d 100644 --- a/arch/arm64/boot/dts/qcom/ipq8074.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq8074.dtsi @@ -487,7 +487,6 @@ interrupts = <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>; phys = <&qusb_phy_0>, <&usb0_ssphy>; phy-names = "usb2-phy", "usb3-phy"; - tx-fifo-resize; snps,is-utmi-l1-suspend; snps,hird-threshold = /bits/ 8 <0x0>; snps,dis_u2_susphy_quirk; @@ -528,7 +527,6 @@ interrupts = <GIC_SPI 99 IRQ_TYPE_LEVEL_HIGH>; phys = <&qusb_phy_1>, <&usb1_ssphy>; phy-names = "usb2-phy", "usb3-phy"; - tx-fifo-resize; snps,is-utmi-l1-suspend; snps,hird-threshold = /bits/ 8 <0x0>; snps,dis_u2_susphy_quirk; diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index 7535dc7cc5aa..bd68e1b7f29f 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -50,9 +50,6 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr); void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size); #define acpi_os_ioremap acpi_os_ioremap -void __iomem *acpi_os_memmap(acpi_physical_address phys, acpi_size size); -#define acpi_os_memmap acpi_os_memmap - typedef u64 phys_cpuid_t; #define PHYS_CPUID_INVALID INVALID_HWID diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 89faca0e740d..bfa58409a4d4 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -525,6 +525,11 @@ alternative_endif #define EXPORT_SYMBOL_NOKASAN(name) EXPORT_SYMBOL(name) #endif +#ifdef CONFIG_KASAN_HW_TAGS +#define EXPORT_SYMBOL_NOHWKASAN(name) +#else +#define EXPORT_SYMBOL_NOHWKASAN(name) EXPORT_SYMBOL_NOKASAN(name) +#endif /* * Emit a 64-bit absolute little endian symbol reference in a way that * ensures that it will be resolved at build time, even when building a diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index 79c1a750e357..eaa6ca062d89 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -107,11 +107,6 @@ struct compat_statfs { #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current))) #define COMPAT_MINSIGSTKSZ 2048 -static inline void __user *arch_compat_alloc_user_space(long len) -{ - return (void __user *)compat_user_stack_pointer() - len; -} - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index cdfa2a242e9f..ef6be92b1921 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -602,14 +602,14 @@ static inline bool id_aa64pfr0_32bit_el1(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT); - return val == ID_AA64PFR0_EL1_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); - return val == ID_AA64PFR0_EL0_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_sve(u64 pfr0) @@ -784,13 +784,13 @@ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { - case 0: return 32; - case 1: return 36; - case 2: 
return 40; - case 3: return 42; - case 4: return 44; - case 5: return 48; - case 6: return 52; + case ID_AA64MMFR0_PARANGE_32: return 32; + case ID_AA64MMFR0_PARANGE_36: return 36; + case ID_AA64MMFR0_PARANGE_40: return 40; + case ID_AA64MMFR0_PARANGE_42: return 42; + case ID_AA64MMFR0_PARANGE_44: return 44; + case ID_AA64MMFR0_PARANGE_48: return 48; + case ID_AA64MMFR0_PARANGE_52: return 52; /* * A future PE could use a value unknown to the kernel. * However, by the "D10.1.4 Principles of the ID scheme diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index d436831dd706..327120c0089f 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -12,8 +12,13 @@ #include <asm/types.h> /* Hyp Configuration Register (HCR) bits */ + +#define HCR_TID5 (UL(1) << 58) +#define HCR_DCT (UL(1) << 57) #define HCR_ATA_SHIFT 56 #define HCR_ATA (UL(1) << HCR_ATA_SHIFT) +#define HCR_AMVOFFEN (UL(1) << 51) +#define HCR_FIEN (UL(1) << 47) #define HCR_FWB (UL(1) << 46) #define HCR_API (UL(1) << 41) #define HCR_APK (UL(1) << 40) @@ -32,9 +37,9 @@ #define HCR_TVM (UL(1) << 26) #define HCR_TTLB (UL(1) << 25) #define HCR_TPU (UL(1) << 24) -#define HCR_TPC (UL(1) << 23) +#define HCR_TPC (UL(1) << 23) /* HCR_TPCP if FEAT_DPB */ #define HCR_TSW (UL(1) << 22) -#define HCR_TAC (UL(1) << 21) +#define HCR_TACR (UL(1) << 21) #define HCR_TIDCP (UL(1) << 20) #define HCR_TSC (UL(1) << 19) #define HCR_TID3 (UL(1) << 18) @@ -56,12 +61,13 @@ #define HCR_PTW (UL(1) << 2) #define HCR_SWIO (UL(1) << 1) #define HCR_VM (UL(1) << 0) +#define HCR_RES0 ((UL(1) << 48) | (UL(1) << 39)) /* * The bits we set in HCR: * TLOR: Trap LORegion register accesses * RW: 64bit by default, can be overridden for 32bit VMs - * TAC: Trap ACTLR + * TACR: Trap ACTLR * TSC: Trap SMC * TSW: Trap cache operations by set/way * TWE: Trap WFE @@ -76,7 +82,7 @@ * PTW: Take a stage2 fault if a stage1 walk steps in device memory */ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ - HCR_BSU_IS | HCR_FB | HCR_TAC | \ + HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ HCR_FMO | HCR_IMO | HCR_PTW ) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) @@ -275,24 +281,40 @@ #define CPTR_EL2_TTA (1 << 20) #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) #define CPTR_EL2_TZ (1 << 8) -#define CPTR_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 */ -#define CPTR_EL2_DEFAULT CPTR_EL2_RES1 +#define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ +#define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1 +#define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \ + GENMASK(29, 21) | \ + GENMASK(19, 14) | \ + BIT(11)) /* Hyp Debug Configuration Register bits */ #define MDCR_EL2_E2TB_MASK (UL(0x3)) #define MDCR_EL2_E2TB_SHIFT (UL(24)) -#define MDCR_EL2_TTRF (1 << 19) -#define MDCR_EL2_TPMS (1 << 14) +#define MDCR_EL2_HPMFZS (UL(1) << 36) +#define MDCR_EL2_HPMFZO (UL(1) << 29) +#define MDCR_EL2_MTPME (UL(1) << 28) +#define MDCR_EL2_TDCC (UL(1) << 27) +#define MDCR_EL2_HCCD (UL(1) << 23) +#define MDCR_EL2_TTRF (UL(1) << 19) +#define MDCR_EL2_HPMD (UL(1) << 17) +#define MDCR_EL2_TPMS (UL(1) << 14) #define MDCR_EL2_E2PB_MASK (UL(0x3)) #define MDCR_EL2_E2PB_SHIFT (UL(12)) -#define MDCR_EL2_TDRA (1 << 11) -#define MDCR_EL2_TDOSA (1 << 10) -#define MDCR_EL2_TDA (1 << 9) -#define MDCR_EL2_TDE (1 << 8) -#define MDCR_EL2_HPME (1 << 7) -#define MDCR_EL2_TPM (1 << 6) -#define MDCR_EL2_TPMCR (1 << 5) -#define MDCR_EL2_HPMN_MASK (0x1F) +#define MDCR_EL2_TDRA (UL(1) 
<< 11) +#define MDCR_EL2_TDOSA (UL(1) << 10) +#define MDCR_EL2_TDA (UL(1) << 9) +#define MDCR_EL2_TDE (UL(1) << 8) +#define MDCR_EL2_HPME (UL(1) << 7) +#define MDCR_EL2_TPM (UL(1) << 6) +#define MDCR_EL2_TPMCR (UL(1) << 5) +#define MDCR_EL2_HPMN_MASK (UL(0x1F)) +#define MDCR_EL2_RES0 (GENMASK(63, 37) | \ + GENMASK(35, 30) | \ + GENMASK(25, 24) | \ + GENMASK(22, 20) | \ + BIT(18) | \ + GENMASK(16, 15)) /* For compatibility with fault code shared with 32-bit */ #define FSC_FAULT ESR_ELx_FSC_FAULT diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9f0bf2109be7..e86045ac43ba 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -59,12 +59,11 @@ #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 #define __KVM_HOST_SMCCC_FUNC___pkvm_init 15 -#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings 16 +#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16 #define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17 #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 -#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20 -#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21 +#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20 #ifndef __ASSEMBLY__ @@ -210,7 +209,7 @@ extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); -extern u32 __kvm_get_mdcr_el2(void); +extern u64 __kvm_get_mdcr_el2(void); #define __KVM_EXTABLE(from, to) \ " .pushsection __kvm_ex_table, \"a\"\n" \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 41911585ae0c..f8be56d5342b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -66,7 +66,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int kvm_sve_max_vl; int kvm_arm_init_sve(void); -int __attribute_const__ kvm_target_cpu(void); +u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); @@ -185,7 +185,6 @@ enum vcpu_sysreg { PMCNTENSET_EL0, /* Count Enable Set Register */ PMINTENSET_EL1, /* Interrupt Enable Set Register */ PMOVSSET_EL0, /* Overflow Flag Status Set Register */ - PMSWINC_EL0, /* Software Increment Register */ PMUSERENR_EL0, /* User Enable Register */ /* Pointer Authentication Registers in a strict increasing order. */ @@ -287,9 +286,13 @@ struct kvm_vcpu_arch { /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; - /* HYP configuration */ + /* Values of trap registers for the guest. */ u64 hcr_el2; - u32 mdcr_el2; + u64 mdcr_el2; + u64 cptr_el2; + + /* Values of trap registers for the host before guest entry. 
*/ + u64 mdcr_el2_host; /* Exception Information */ struct kvm_vcpu_fault_info fault; @@ -576,6 +579,7 @@ struct kvm_vcpu_stat { u64 wfi_exit_stat; u64 mmio_exit_user; u64 mmio_exit_kernel; + u64 signal_exits; u64 exits; }; @@ -771,6 +775,11 @@ void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); +static inline bool kvm_vm_is_protected(struct kvm *kvm) +{ + return false; +} + int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 9d60b3006efc..657d0c94cf82 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -95,7 +95,7 @@ void __sve_restore_state(void *sve_pffr, u32 *fpsr); #ifndef __KVM_NVHE_HYPERVISOR__ void activate_traps_vhe_load(struct kvm_vcpu *vcpu); -void deactivate_traps_vhe_put(void); +void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu); #endif u64 __guest_enter(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b52c5c4b9a3d..02d378887743 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -252,6 +252,11 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) +/* + * When this is (directly or indirectly) used on the TLB invalidation + * path, we rely on a previously issued DSB so that page table updates + * and VMID reads are correctly ordered. + */ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) { struct kvm_vmid *vmid = &mmu->vmid; @@ -259,7 +264,7 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0; baddr = mmu->pgd_phys; - vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + vmid_field = (u64)READ_ONCE(vmid->vmid) << VTTBR_VMID_SHIFT; return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } @@ -267,9 +272,10 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) * Must be called from hyp code running at EL2 with an updated VTTBR * and interrupts disabled. 
*/ -static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr) +static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, + struct kvm_arch *arch) { - write_sysreg(vtcr, vtcr_el2); + write_sysreg(arch->vtcr, vtcr_el2); write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); /* @@ -280,11 +286,6 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) -{ - __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr); -} - static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) { return container_of(mmu->arch, struct kvm, arch); diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index f004c0115d89..027783829584 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -25,6 +25,46 @@ static inline u64 kvm_get_parange(u64 mmfr0) typedef u64 kvm_pte_t; +#define KVM_PTE_VALID BIT(0) + +#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) + +static inline bool kvm_pte_valid(kvm_pte_t pte) +{ + return pte & KVM_PTE_VALID; +} + +static inline u64 kvm_pte_to_phys(kvm_pte_t pte) +{ + u64 pa = pte & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + + return pa; +} + +static inline u64 kvm_granule_shift(u32 level) +{ + /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); +} + +static inline u64 kvm_granule_size(u32 level) +{ + return BIT(kvm_granule_shift(level)); +} + +static inline bool kvm_level_supports_block_mapping(u32 level) +{ + /* + * Reject invalid block mappings and don't bother with 4TB mappings for + * 52-bit PAs. + */ + return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); +} + /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. @@ -76,30 +116,15 @@ enum kvm_pgtable_stage2_flags { }; /** - * struct kvm_pgtable - KVM page-table. - * @ia_bits: Maximum input address size, in bits. - * @start_level: Level at which the page-table walk starts. - * @pgd: Pointer to the first top-level entry of the page-table. - * @mm_ops: Memory management callbacks. - * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. - */ -struct kvm_pgtable { - u32 ia_bits; - u32 start_level; - kvm_pte_t *pgd; - struct kvm_pgtable_mm_ops *mm_ops; - - /* Stage-2 only */ - struct kvm_s2_mmu *mmu; - enum kvm_pgtable_stage2_flags flags; -}; - -/** * enum kvm_pgtable_prot - Page-table permissions and attributes. * @KVM_PGTABLE_PROT_X: Execute permission. * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + * @KVM_PGTABLE_PROT_SW0: Software bit 0. + * @KVM_PGTABLE_PROT_SW1: Software bit 1. + * @KVM_PGTABLE_PROT_SW2: Software bit 2. + * @KVM_PGTABLE_PROT_SW3: Software bit 3. 
*/ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_X = BIT(0), @@ -107,21 +132,48 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), + + KVM_PGTABLE_PROT_SW0 = BIT(55), + KVM_PGTABLE_PROT_SW1 = BIT(56), + KVM_PGTABLE_PROT_SW2 = BIT(57), + KVM_PGTABLE_PROT_SW3 = BIT(58), }; -#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define KVM_PGTABLE_PROT_RW (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define KVM_PGTABLE_PROT_RWX (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X) + +#define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX +#define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW + +#define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) #define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) +typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, + enum kvm_pgtable_prot prot); + /** - * struct kvm_mem_range - Range of Intermediate Physical Addresses - * @start: Start of the range. - * @end: End of the range. + * struct kvm_pgtable - KVM page-table. + * @ia_bits: Maximum input address size, in bits. + * @start_level: Level at which the page-table walk starts. + * @pgd: Pointer to the first top-level entry of the page-table. + * @mm_ops: Memory management callbacks. + * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. + * @flags: Stage-2 page-table flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. */ -struct kvm_mem_range { - u64 start; - u64 end; +struct kvm_pgtable { + u32 ia_bits; + u32 start_level; + kvm_pte_t *pgd; + struct kvm_pgtable_mm_ops *mm_ops; + + /* Stage-2 only */ + struct kvm_s2_mmu *mmu; + enum kvm_pgtable_stage2_flags flags; + kvm_pgtable_force_pte_cb_t force_pte_cb; }; /** @@ -216,21 +268,24 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); /** - * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table. + * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. * @arch: Arch-specific KVM structure representing the guest virtual * machine. * @mm_ops: Memory management callbacks. * @flags: Stage-2 configuration flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, - struct kvm_pgtable_mm_ops *mm_ops, - enum kvm_pgtable_stage2_flags flags); +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags, + kvm_pgtable_force_pte_cb_t force_pte_cb); #define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \ - kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0) + __kvm_pgtable_stage2_init(pgt, arch, mm_ops, 0, NULL) /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. @@ -374,7 +429,8 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); * If there is a valid, leaf page-table entry used to translate @addr, then * relax the permissions in that entry according to the read, write and * execute permissions specified by @prot. No permissions are removed, and - * TLB invalidation is performed after updating the entry. 
+ * TLB invalidation is performed after updating the entry. Software bits cannot + * be set or cleared using kvm_pgtable_stage2_relax_perms(). * * Return: 0 on success, negative error code on failure. */ @@ -433,22 +489,42 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker); /** - * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical - * Addresses with compatible permission - * attributes. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). - * @addr: Address that must be covered by the range. - * @prot: Protection attributes that the range must be compatible with. - * @range: Range structure used to limit the search space at call time and - * that will hold the result. + * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry + * with its level. + * @pgt: Page-table structure initialised by kvm_pgtable_*_init() + * or a similar initialiser. + * @addr: Input address for the start of the walk. + * @ptep: Pointer to storage for the retrieved PTE. + * @level: Pointer to storage for the level of the retrieved PTE. + * + * The offset of @addr within a page is ignored. * - * The offset of @addr within a page is ignored. An IPA is compatible with @prot - * iff its corresponding stage-2 page-table entry has default ownership and, if - * valid, is mapped with protection attributes identical to @prot. + * The walker will walk the page-table entries corresponding to the input + * address specified, retrieving the leaf corresponding to this address. + * Invalid entries are treated as leaf entries. * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot, - struct kvm_mem_range *range); +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, + kvm_pte_t *ptep, u32 *level); + +/** + * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a + * stage-2 Page-Table Entry. + * @pte: Page-table entry + * + * Return: protection attributes of the page-table entry in the enum + * kvm_pgtable_prot format. + */ +enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte); + +/** + * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1 + * Page-Table Entry. + * @pte: Page-table entry + * + * Return: protection attributes of the page-table entry in the enum + * kvm_pgtable_prot format. 
+ */ +enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte); #endif /* __ARM64_KVM_PGTABLE_H__ */ diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 3f93b9e0b339..02511650cffe 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -99,11 +99,17 @@ void mte_check_tfsr_el1(void); static inline void mte_check_tfsr_entry(void) { + if (!system_supports_mte()) + return; + mte_check_tfsr_el1(); } static inline void mte_check_tfsr_exit(void) { + if (!system_supports_mte()) + return; + /* * The asynchronous faults are sync'ed automatically with * TFSR_EL1 on kernel entry but for exit an explicit dsb() diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 3a3264ff47b9..95f7686b728d 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -12,11 +12,13 @@ extern char *strrchr(const char *, int c); #define __HAVE_ARCH_STRCHR extern char *strchr(const char *, int c); +#ifndef CONFIG_KASAN_HW_TAGS #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +#endif #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index f2e06e7c0a31..b268082d67ed 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -784,14 +784,13 @@ #define ID_AA64PFR0_AMU 0x1 #define ID_AA64PFR0_SVE 0x1 #define ID_AA64PFR0_RAS_V1 0x1 +#define ID_AA64PFR0_RAS_V1P1 0x2 #define ID_AA64PFR0_FP_NI 0xf #define ID_AA64PFR0_FP_SUPPORTED 0x0 #define ID_AA64PFR0_ASIMD_NI 0xf #define ID_AA64PFR0_ASIMD_SUPPORTED 0x0 -#define ID_AA64PFR0_EL1_64BIT_ONLY 0x1 -#define ID_AA64PFR0_EL1_32BIT_64BIT 0x2 -#define ID_AA64PFR0_EL0_64BIT_ONLY 0x1 -#define ID_AA64PFR0_EL0_32BIT_64BIT 0x2 +#define ID_AA64PFR0_ELx_64BIT_ONLY 0x1 +#define ID_AA64PFR0_ELx_32BIT_64BIT 0x2 /* id_aa64pfr1 */ #define ID_AA64PFR1_MPAMFRAC_SHIFT 16 @@ -847,6 +846,9 @@ #define ID_AA64MMFR0_ASID_SHIFT 4 #define ID_AA64MMFR0_PARANGE_SHIFT 0 +#define ID_AA64MMFR0_ASID_8 0x0 +#define ID_AA64MMFR0_ASID_16 0x2 + #define ID_AA64MMFR0_TGRAN4_NI 0xf #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN 0x0 #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX 0x7 @@ -857,9 +859,16 @@ #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN 0x1 #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX 0xf +#define ID_AA64MMFR0_PARANGE_32 0x0 +#define ID_AA64MMFR0_PARANGE_36 0x1 +#define ID_AA64MMFR0_PARANGE_40 0x2 +#define ID_AA64MMFR0_PARANGE_42 0x3 +#define ID_AA64MMFR0_PARANGE_44 0x4 #define ID_AA64MMFR0_PARANGE_48 0x5 #define ID_AA64MMFR0_PARANGE_52 0x6 +#define ARM64_MIN_PARANGE_BITS 32 + #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2 @@ -904,6 +913,7 @@ #define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ +#define ID_AA64DFR0_MTPMU_SHIFT 48 #define ID_AA64DFR0_TRBE_SHIFT 44 #define ID_AA64DFR0_TRACE_FILT_SHIFT 40 #define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 @@ -1034,14 +1044,17 @@ #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN4_2_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN 
ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN16_2_SHIFT #elif defined(CONFIG_ARM64_64K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN64_2_SHIFT #endif #define MVFR2_FPMISC_SHIFT 4 @@ -1172,6 +1185,11 @@ #define ICH_VTR_A3V_SHIFT 21 #define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) +#define ARM64_FEATURE_FIELD_BITS 4 + +/* Create a mask for the feature bits of the specified feature. */ +#define ARM64_FEATURE_MASK(x) (GENMASK_ULL(x##_SHIFT + ARM64_FEATURE_FIELD_BITS - 1, x##_SHIFT)) + #ifdef __ASSEMBLY__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index b5f08621fa29..190b494e22ab 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -430,17 +430,6 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi __actu_ret; \ }) -extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n); -#define raw_copy_in_user(to, from, n) \ -({ \ - unsigned long __aciu_ret; \ - uaccess_ttbr0_enable(); \ - __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), \ - __uaccess_mask_ptr(from), (n)); \ - uaccess_ttbr0_disable(); \ - __aciu_ret; \ -}) - #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 727bfc3be99b..3cb206aea3db 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 447 +#define __NR_compat_syscalls 449 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 03d4ca47d253..844f6ae58662 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -649,11 +649,11 @@ __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) #define __NR_inotify_rm_watch 318 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) #define __NR_mbind 319 -__SYSCALL(__NR_mbind, compat_sys_mbind) +__SYSCALL(__NR_mbind, sys_mbind) #define __NR_get_mempolicy 320 -__SYSCALL(__NR_get_mempolicy, compat_sys_get_mempolicy) +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) #define __NR_set_mempolicy 321 -__SYSCALL(__NR_set_mempolicy, compat_sys_set_mempolicy) +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) #define __NR_openat 322 __SYSCALL(__NR_openat, compat_sys_openat) #define __NR_mkdirat 323 @@ -699,7 +699,7 @@ __SYSCALL(__NR_tee, sys_tee) #define __NR_vmsplice 343 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 344 -__SYSCALL(__NR_move_pages, compat_sys_move_pages) +__SYSCALL(__NR_move_pages, sys_move_pages) #define __NR_getcpu 345 __SYSCALL(__NR_getcpu, sys_getcpu) #define __NR_epoll_pwait 346 @@ -811,7 +811,7 @@ __SYSCALL(__NR_rseq, sys_rseq) #define __NR_io_pgetevents 399 __SYSCALL(__NR_io_pgetevents, compat_sys_io_pgetevents) #define __NR_migrate_pages 400 -__SYSCALL(__NR_migrate_pages, compat_sys_migrate_pages) +__SYSCALL(__NR_migrate_pages, 
sys_migrate_pages) #define __NR_kexec_file_load 401 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) /* 402 is unused */ @@ -901,6 +901,8 @@ __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) #define __NR_landlock_restrict_self 446 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 1c9c2f7a1c04..f3851724fe35 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -273,8 +273,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) return __pgprot(PROT_DEVICE_nGnRnE); } -static void __iomem *__acpi_os_ioremap(acpi_physical_address phys, - acpi_size size, bool memory) +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) { efi_memory_desc_t *md, *region = NULL; pgprot_t prot; @@ -300,11 +299,9 @@ static void __iomem *__acpi_os_ioremap(acpi_physical_address phys, * It is fine for AML to remap regions that are not represented in the * EFI memory map at all, as it only describes normal memory, and MMIO * regions that require a virtual mapping to make them accessible to - * the EFI runtime services. Determine the region default - * attributes by checking the requested memory semantics. + * the EFI runtime services. */ - prot = memory ? __pgprot(PROT_NORMAL_NC) : - __pgprot(PROT_DEVICE_nGnRnE); + prot = __pgprot(PROT_DEVICE_nGnRnE); if (region) { switch (region->type) { case EFI_LOADER_CODE: @@ -364,16 +361,6 @@ static void __iomem *__acpi_os_ioremap(acpi_physical_address phys, return __ioremap(phys, size, prot); } -void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) -{ - return __acpi_os_ioremap(phys, size, false); -} - -void __iomem *acpi_os_memmap(acpi_physical_address phys, acpi_size size) -{ - return __acpi_os_ioremap(phys, size, true); -} - /* * Claim Synchronous External Aborts as a firmware first notification. 
* diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 7fa6828bb488..587543c6c51c 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -43,7 +43,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->type = type; } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { unsigned int ctype, level, leaves, fw_level; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -78,7 +78,7 @@ static int __init_cache_level(unsigned int cpu) return 0; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { unsigned int level, idx; enum cache_type type; @@ -97,6 +97,3 @@ static int __populate_cache_leaves(unsigned int cpu) } return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b2770d753ba3..6ec7036ef7e1 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -240,8 +240,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), ARM64_FTR_END, }; @@ -1526,9 +1526,13 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, /* * For reasons that aren't entirely clear, enabling KPTI on Cavium * ThunderX leads to apparent I-cache corruption of kernel text, which - * ends as well as you might imagine. Don't even try. + * ends as well as you might imagine. Don't even try. We cannot rely + * on the cpus_have_*cap() helpers here to detect the CPU erratum + * because cpucap detection order may change. However, since we know + * affected CPUs are always in a homogeneous configuration, it is + * safe to rely on this_cpu_has_cap() here. 
*/ - if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_27456)) { + if (this_cpu_has_cap(ARM64_WORKAROUND_CAVIUM_27456)) { str = "ARM64_WORKAROUND_CAVIUM_27456"; __kpti_forced = -1; } @@ -1983,7 +1987,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL0_SHIFT, - .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, #ifdef CONFIG_KVM { @@ -1994,7 +1998,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL1_SHIFT, - .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, { .desc = "Protected KVM", diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 5a294f20e9de..ff4962750b3d 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -513,7 +513,7 @@ size_t sve_state_size(struct task_struct const *task) void sve_alloc(struct task_struct *task) { if (task->thread.sve_state) { - memset(task->thread.sve_state, 0, sve_state_size(current)); + memset(task->thread.sve_state, 0, sve_state_size(task)); return; } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 9d314a3bad3b..e5e801bc5312 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -142,12 +142,7 @@ void mte_enable_kernel_async(void) #ifdef CONFIG_KASAN_HW_TAGS void mte_check_tfsr_el1(void) { - u64 tfsr_el1; - - if (!system_supports_mte()) - return; - - tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); + u64 tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); if (unlikely(tfsr_el1 & SYS_TFSR_EL1_TF1)) { /* @@ -199,6 +194,9 @@ void mte_thread_init_user(void) void mte_thread_switch(struct task_struct *next) { + if (!system_supports_mte()) + return; + mte_update_sctlr_user(next); /* diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 1006ed2d7c60..2276689b5411 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -82,14 +82,29 @@ int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { - if (!acpi_disabled) { - struct pci_config_window *cfg = bridge->bus->sysdata; - struct acpi_device *adev = to_acpi_device(cfg->parent); - struct device *bus_dev = &bridge->bus->dev; + struct pci_config_window *cfg; + struct acpi_device *adev; + struct device *bus_dev; - ACPI_COMPANION_SET(&bridge->dev, adev); - set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); - } + if (acpi_disabled) + return 0; + + cfg = bridge->bus->sysdata; + + /* + * On Hyper-V there is no corresponding ACPI device for a root bridge, + * therefore ->parent is set as NULL by the driver. And set 'adev' as + * NULL in this case because there is no proper ACPI device. + */ + if (!cfg->parent) + adev = NULL; + else + adev = to_acpi_device(cfg->parent); + + bus_dev = &bridge->bus->dev; + + ACPI_COMPANION_SET(&bridge->dev, adev); + set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); return 0; } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 2bd270cd603e..40adb8cdbf5a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -6,9 +6,6 @@ * Copyright (C) 1996-2000 Russell King - Converted to ARM. * Copyright (C) 2012 ARM Ltd. 
*/ - -#include <stdarg.h> - #include <linux/compat.h> #include <linux/efi.h> #include <linux/elf.h> @@ -21,7 +18,6 @@ #include <linux/mman.h> #include <linux/mm.h> #include <linux/nospec.h> -#include <linux/sched.h> #include <linux/stddef.h> #include <linux/sysctl.h> #include <linux/unistd.h> @@ -61,7 +57,7 @@ #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> -unsigned long __stack_chk_guard __read_mostly; +unsigned long __stack_chk_guard __ro_after_init; EXPORT_SYMBOL(__stack_chk_guard); #endif diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 9fe70b12b34f..c287b9407f28 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -940,10 +940,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); - if (thread_flags & _TIF_NOTIFY_RESUME) { + if (thread_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } if (thread_flags & _TIF_FOREIGN_FPSTATE) fpsimd_restore_current_state(); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 709d2c433c5e..f6b1a88245db 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -181,6 +181,8 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) + HYPERVISOR_DATA_SECTIONS + idmap_pg_dir = .; . += IDMAP_DIR_SIZE; idmap_pg_end = .; @@ -260,8 +262,6 @@ SECTIONS _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) - HYPERVISOR_DATA_SECTIONS - /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a4eba0908bfa..d7eec0b43744 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -26,6 +26,7 @@ menuconfig KVM select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select KVM_XFER_TO_GUEST_WORK select SRCU select KVM_VFIO select HAVE_KVM_EVENTFD @@ -46,6 +47,15 @@ if KVM source "virt/kvm/Kconfig" +config NVHE_EL2_DEBUG + bool "Debug mode for non-VHE EL2 object" + help + Say Y here to enable the debug mode for the non-VHE KVM EL2 object. + Failure reports will BUG() in the hypervisor. This is intended for + local EL2 hypervisor development. + + If unsure, say N. 
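The NVHE_EL2_DEBUG help text above promises that failure reports will BUG() at EL2. As a rough illustration of the idiom this option enables (the hyp_assert() name below is invented for this note; the concrete instance added later in this diff is hyp_assert_lock_held() in the nVHE spinlock header), a debug-only assertion might look like:

/*
 * Hypothetical sketch, not part of the patch: compiles to nothing unless
 * CONFIG_NVHE_EL2_DEBUG is set, in which case a violated invariant BUG()s
 * in the hypervisor and is reported through the nVHE panic path.
 */
#ifdef CONFIG_NVHE_EL2_DEBUG
#define hyp_assert(cond)	BUG_ON(!(cond))
#else
#define hyp_assert(cond)	do { } while (0)
#endif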
+ endif # KVM endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0ca72f5cda41..fe102cd2e518 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -6,6 +6,7 @@ #include <linux/bug.h> #include <linux/cpu_pm.h> +#include <linux/entry-kvm.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/kvm_host.h> @@ -15,6 +16,7 @@ #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> +#include <linux/kmemleak.h> #include <linux/kvm.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> @@ -42,10 +44,6 @@ #include <kvm/arm_pmu.h> #include <kvm/arm_psci.h> -#ifdef REQUIRES_VIRT -__asm__(".arch_extension virt"); -#endif - static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); @@ -575,7 +573,7 @@ static void update_vmid(struct kvm_vmid *vmid) kvm_call_hyp(__kvm_flush_vm_context); } - vmid->vmid = kvm_next_vmid; + WRITE_ONCE(vmid->vmid, kvm_next_vmid); kvm_next_vmid++; kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; @@ -719,6 +717,45 @@ static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) } /** + * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest + * @vcpu: The VCPU pointer + * @ret: Pointer to write optional return code + * + * Returns: true if the VCPU needs to return to a preemptible + interruptible + * and skip guest entry. + * + * This function disambiguates between two different types of exits: exits to a + * preemptible + interruptible kernel context and exits to userspace. For an + * exit to userspace, this function will write the return code to ret and return + * true. For an exit to preemptible + interruptible kernel context (i.e. check + * for pending work and re-enter), return true without writing to ret. + */ +static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) +{ + struct kvm_run *run = vcpu->run; + + /* + * If we're using a userspace irqchip, then check if we need + * to tell a userspace irqchip about timer or PMU level + * changes and if so, exit to userspace (the actual level + * state gets updated in kvm_timer_update_run and + * kvm_pmu_update_run below). + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { + *ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + return true; + } + } + + return kvm_request_pending(vcpu) || + need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) || + xfer_to_guest_mode_work_pending(); +} + +/** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer * @@ -761,7 +798,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* * Check conditions before entering the guest */ - cond_resched(); + ret = xfer_to_guest_mode_handle_work(vcpu); + if (!ret) + ret = 1; update_vmid(&vcpu->arch.hw_mmu->vmid); @@ -781,30 +820,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_vgic_flush_hwstate(vcpu); /* - * Exit if we have a signal pending so that we can deliver the - * signal to user space. - */ - if (signal_pending(current)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - - /* - * If we're using a userspace irqchip, then check if we need - * to tell a userspace irqchip about timer or PMU level - * changes and if so, exit to userspace (the actual level - * state gets updated in kvm_timer_update_run and - * kvm_pmu_update_run below). 
- */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { - if (kvm_timer_should_notify_user(vcpu) || - kvm_pmu_should_notify_user(vcpu)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - } - - /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. * See the comment in kvm_vcpu_exiting_guest_mode() and @@ -812,8 +827,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - if (ret <= 0 || need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) || - kvm_request_pending(vcpu)) { + if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ kvm_pmu_sync_hwstate(vcpu); @@ -1039,7 +1053,7 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { unsigned int i, ret; - int phys_target = kvm_target_cpu(); + u32 phys_target = kvm_target_cpu(); if (init->target != phys_target) return -EINVAL; @@ -1108,6 +1122,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, } vcpu_reset_hcr(vcpu); + vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT; /* * Handle the "start in power-off" case. @@ -1219,6 +1234,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(®, argp, sizeof(reg))) break; + /* + * We could owe a reset due to PSCI. Handle the pending reset + * here to ensure userspace register accesses are ordered after + * the reset. + */ + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + if (ioctl == KVM_SET_ONE_REG) r = kvm_arm_set_reg(vcpu, ®); else @@ -1700,11 +1723,6 @@ static bool init_psci_relay(void) return true; } -static int init_common_resources(void) -{ - return kvm_set_ipa_limit(); -} - static int init_subsystems(void) { int err = 0; @@ -1958,56 +1976,17 @@ static void _kvm_host_prot_finalize(void *discard) WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)); } -static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) -{ - return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end); -} - -#define pkvm_mark_hyp_section(__section) \ - pkvm_mark_hyp(__pa_symbol(__section##_start), \ - __pa_symbol(__section##_end)) - static int finalize_hyp_mode(void) { - int cpu, ret; - if (!is_protected_kvm_enabled()) return 0; - ret = pkvm_mark_hyp_section(__hyp_idmap_text); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_text); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_rodata); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_bss); - if (ret) - return ret; - - ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size); - if (ret) - return ret; - - for_each_possible_cpu(cpu) { - phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]); - phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order()); - - ret = pkvm_mark_hyp(start, end); - if (ret) - return ret; - - start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu)); - end = start + PAGE_SIZE; - ret = pkvm_mark_hyp(start, end); - if (ret) - return ret; - } + /* + * Exclude HYP BSS from kmemleak so that it doesn't get peeked + * at, which would end badly once the section is inaccessible. + * None of other sections should ever be introspected. 
+ */ + kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); /* * Flip the static key upfront as that may no longer be possible @@ -2019,11 +1998,6 @@ static int finalize_hyp_mode(void) return 0; } -static void check_kvm_target_cpu(void *ret) -{ - *(int *)ret = kvm_target_cpu(); -} - struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { struct kvm_vcpu *vcpu; @@ -2083,7 +2057,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) int kvm_arch_init(void *opaque) { int err; - int ret, cpu; bool in_hyp_mode; if (!is_hyp_mode_available()) { @@ -2098,15 +2071,7 @@ int kvm_arch_init(void *opaque) kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ "Only trusted guests should be used on this system.\n"); - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); - if (ret < 0) { - kvm_err("Error, CPU %d not supported!\n", cpu); - return -ENODEV; - } - } - - err = init_common_resources(); + err = kvm_set_ipa_limit(); if (err) return err; diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index d5e79d7ee6e9..db9361338b2a 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -21,7 +21,7 @@ DBG_MDSCR_KDE | \ DBG_MDSCR_MDE) -static DEFINE_PER_CPU(u32, mdcr_el2); +static DEFINE_PER_CPU(u64, mdcr_el2); /** * save/restore_guest_debug_regs diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 1dfb83578277..5ce26bedf23c 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -31,8 +31,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -50,10 +48,9 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, wfi_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), + STATS_DESC_COUNTER(VCPU, signal_exits), STATS_DESC_COUNTER(VCPU, exits) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -842,7 +839,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, return 0; } -int __attribute_const__ kvm_target_cpu(void) +u32 __attribute_const__ kvm_target_cpu(void) { unsigned long implementor = read_cpuid_implementor(); unsigned long part_number = read_cpuid_part_number(); @@ -874,7 +871,7 @@ int __attribute_const__ kvm_target_cpu(void) int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) { - int target = kvm_target_cpu(); + u32 target = kvm_target_cpu(); if (target < 0) return -ENODEV; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 6f48336b1d86..275a27368a04 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -113,34 +113,20 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) * guest and host are using the same debug facilities it will be up to * userspace to re-inject the correct exception for guest delivery. 
* - * @return: 0 (while setting vcpu->run->exit_reason), -1 for error + * @return: 0 (while setting vcpu->run->exit_reason) */ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 esr = kvm_vcpu_get_esr(vcpu); - int ret = 0; run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = esr; - switch (ESR_ELx_EC(esr)) { - case ESR_ELx_EC_WATCHPT_LOW: + if (ESR_ELx_EC(esr) == ESR_ELx_EC_WATCHPT_LOW) run->debug.arch.far = vcpu->arch.fault.far_el2; - fallthrough; - case ESR_ELx_EC_SOFTSTP_LOW: - case ESR_ELx_EC_BREAKPT_LOW: - case ESR_ELx_EC_BKPT32: - case ESR_ELx_EC_BRK64: - break; - default: - kvm_err("%s: un-handled case esr: %#08x\n", - __func__, (unsigned int) esr); - ret = -1; - break; - } - return ret; + return 0; } static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) @@ -292,11 +278,12 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } -void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, +void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, + u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, u64 far, u64 hpfar) { - u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr)); - u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr; + u64 elr_in_kimg = __phys_to_kimg(elr_phys); + u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt; u64 mode = spsr & PSR_MODE_MASK; /* @@ -309,20 +296,24 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, kvm_err("Invalid host exception to nVHE hyp!\n"); } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) { - struct bug_entry *bug = find_bug(elr_in_kimg); const char *file = NULL; unsigned int line = 0; /* All hyp bugs, including warnings, are treated as fatal. 
*/ - if (bug) - bug_get_file_line(bug, &file, &line); + if (!is_protected_kvm_enabled() || + IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + struct bug_entry *bug = find_bug(elr_in_kimg); + + if (bug) + bug_get_file_line(bug, &file, &line); + } if (file) kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else - kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset); + kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset); } else { - kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset); + kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset); } /* @@ -334,5 +325,5 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, kvm_err("Hyp Offset: 0x%llx\n", hyp_offset); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", - spsr, elr, esr, far, hpfar, par, vcpu); + spsr, elr_virt, esr, far, hpfar, par, vcpu); } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index e4a2f295a394..a0e78a6027be 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -92,11 +92,15 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(0, pmselr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); } + + vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); } -static inline void __deactivate_traps_common(void) +static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { + write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2); + write_sysreg(0, hstr_el2); if (kvm_arm_support_pmu_v3()) write_sysreg(0, pmuserenr_el0); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 9c227d87c36d..b58c910babaf 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -12,6 +12,32 @@ #include <asm/virt.h> #include <nvhe/spinlock.h> +/* + * SW bits 0-1 are reserved to track the memory ownership state of each page: + * 00: The page is owned exclusively by the page-table owner. + * 01: The page is owned by the page-table owner, but is shared + * with another entity. + * 10: The page is shared with, but not owned by the page-table owner. + * 11: Reserved for future use (lending). 
+ */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, + PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, +}; + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return prot & PKVM_PAGE_STATE_PROT_MASK; +} + struct host_kvm { struct kvm_arch arch; struct kvm_pgtable pgt; @@ -20,16 +46,21 @@ struct host_kvm { }; extern struct host_kvm host_kvm; +extern const u8 pkvm_hyp_id; + int __pkvm_prot_finalize(void); -int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end); +int __pkvm_host_share_hyp(u64 pfn); +bool addr_is_memory(phys_addr_t phys); +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) - __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); else write_sysreg(0, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 8ec3a5a7744b..c9a8f535212e 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -23,8 +23,7 @@ int hyp_map_vectors(void); int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); -int __pkvm_create_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot); +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h index 76b537f8d1c6..4652fd04bdbe 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -15,6 +15,7 @@ #include <asm/alternative.h> #include <asm/lse.h> +#include <asm/rwonce.h> typedef union hyp_spinlock { u32 __val; @@ -89,4 +90,28 @@ static inline void hyp_spin_unlock(hyp_spinlock_t *lock) : "memory"); } +static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock) +{ + hyp_spinlock_t lockval = READ_ONCE(*lock); + + return lockval.owner != lockval.next; +} + +#ifdef CONFIG_NVHE_EL2_DEBUG +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) +{ + /* + * The __pkvm_init() path accesses protected data-structures without + * holding locks as the other CPUs are guaranteed to not enter EL2 + * concurrently at this point in time. The point by which EL2 is + * initialized on all CPUs is reflected in the pkvm static key, so + * wait until it is set before checking the lock state. 
+ */ + if (static_branch_likely(&kvm_protected_mode_initialized)) + BUG_ON(!hyp_spin_is_locked(lock)); +} +#else +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { } +#endif + #endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 7d3f25868cae..df361d839902 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -109,7 +109,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) __debug_switch_to_host_common(vcpu); } -u32 __kvm_get_mdcr_el2(void) +u64 __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 2b23400e0fb3..4b652ffb591d 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -7,6 +7,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> @@ -85,12 +86,24 @@ SYM_FUNC_START(__hyp_do_panic) mov x29, x0 +#ifdef CONFIG_NVHE_EL2_DEBUG + /* Ensure host stage-2 is disabled */ + mrs x0, hcr_el2 + bic x0, x0, #HCR_VM + msr hcr_el2, x0 + isb + tlbi vmalls12e1 + dsb nsh +#endif + /* Load the panic arguments into x0-7 */ mrs x0, esr_el2 - get_vcpu_ptr x4, x5 - mrs x5, far_el2 - mrs x6, hpfar_el2 - mov x7, xzr // Unused argument + mov x4, x3 + mov x3, x2 + hyp_pa x3, x6 + get_vcpu_ptr x5, x6 + mrs x6, far_el2 + mrs x7, hpfar_el2 /* Enter the host, conditionally restoring the host context. */ cbz x29, __host_enter_without_restoring diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 1632f001f4ed..2da6aa8da868 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -140,14 +140,11 @@ static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot); } -static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt) +static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(unsigned long, start, host_ctxt, 1); - DECLARE_REG(unsigned long, size, host_ctxt, 2); - DECLARE_REG(unsigned long, phys, host_ctxt, 3); - DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); + DECLARE_REG(u64, pfn, host_ctxt, 1); - cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot); + cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn); } static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) @@ -163,14 +160,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) { cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } - -static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(phys_addr_t, start, host_ctxt, 1); - DECLARE_REG(phys_addr_t, end, host_ctxt, 2); - - cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end); -} typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -193,10 +182,9 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__vgic_v3_restore_aprs), HANDLE_FUNC(__pkvm_init), HANDLE_FUNC(__pkvm_cpu_set_vector), - HANDLE_FUNC(__pkvm_create_mappings), + HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_prot_finalize), - HANDLE_FUNC(__pkvm_mark_hyp), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index a6ce991b1467..bacd493a4eac 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -31,7 +31,7 @@ static struct hyp_pool host_s2_pool; u64 id_aa64mmfr0_el1_sys_val; u64 id_aa64mmfr1_el1_sys_val; -static const u8 pkvm_hyp_id = 1; +const u8 pkvm_hyp_id = 1; static void *host_s2_zalloc_pages_exact(size_t size) { @@ -89,6 +89,8 @@ static void prepare_host_vtcr(void) id_aa64mmfr1_el1_sys_val, phys_shift); } +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot); + int kvm_host_prepare_stage2(void *pgt_pool_base) { struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; @@ -101,16 +103,17 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) if (ret) return ret; - ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch, - &host_kvm.mm_ops, KVM_HOST_S2_FLAGS); + ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch, + &host_kvm.mm_ops, KVM_HOST_S2_FLAGS, + host_stage2_force_pte_cb); if (ret) return ret; mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); mmu->arch = &host_kvm.arch; mmu->pgt = &host_kvm.pgt; - mmu->vmid.vmid_gen = 0; - mmu->vmid.vmid = 0; + WRITE_ONCE(mmu->vmid.vmid_gen, 0); + WRITE_ONCE(mmu->vmid.vmid, 0); return 0; } @@ -126,7 +129,7 @@ int __pkvm_prot_finalize(void) kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg(params->hcr_el2, hcr_el2); - __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); /* * Make sure to have an ISB before the TLB maintenance below but only @@ -159,6 +162,11 @@ static int host_stage2_unmap_dev_all(void) return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); } +struct kvm_mem_range { + u64 start; + u64 end; +}; + static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; @@ -189,16 +197,26 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) return false; } +bool addr_is_memory(phys_addr_t phys) +{ + struct kvm_mem_range range; + + return find_mem_range(phys, &range); +} + +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + static bool range_is_memory(u64 start, u64 end) { - struct kvm_mem_range r1, r2; + struct kvm_mem_range r; - if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2)) - return false; - if (r1.start != r2.start) + if (!find_mem_range(start, &r)) return false; - return true; + return is_in_mem_range(end - 1, &r); } static inline int __host_stage2_idmap(u64 start, u64 end, @@ -208,60 +226,208 @@ static inline int __host_stage2_idmap(u64 start, u64 end, prot, &host_s2_pool); } +/* + * The pool has been provided with enough pages to cover all of memory with + * page granularity, but it is difficult to know how much of the MMIO range + * we will need to cover upfront, so we may need to 'recycle' the pages if we + * run out. + */ +#define host_stage2_try(fn, ...) 
\ + ({ \ + int __ret; \ + hyp_assert_lock_held(&host_kvm.lock); \ + __ret = fn(__VA_ARGS__); \ + if (__ret == -ENOMEM) { \ + __ret = host_stage2_unmap_dev_all(); \ + if (!__ret) \ + __ret = fn(__VA_ARGS__); \ + } \ + __ret; \ + }) + +static inline bool range_included(struct kvm_mem_range *child, + struct kvm_mem_range *parent) +{ + return parent->start <= child->start && child->end <= parent->end; +} + +static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) +{ + struct kvm_mem_range cur; + kvm_pte_t pte; + u32 level; + int ret; + + hyp_assert_lock_held(&host_kvm.lock); + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + if (ret) + return ret; + + if (kvm_pte_valid(pte)) + return -EAGAIN; + + if (pte) + return -EPERM; + + do { + u64 granule = kvm_granule_size(level); + cur.start = ALIGN_DOWN(addr, granule); + cur.end = cur.start + granule; + level++; + } while ((level < KVM_PGTABLE_MAX_LEVELS) && + !(kvm_level_supports_block_mapping(level) && + range_included(&cur, range))); + + *range = cur; + + return 0; +} + +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, + enum kvm_pgtable_prot prot) +{ + hyp_assert_lock_held(&host_kvm.lock); + + return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); +} + +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +{ + hyp_assert_lock_held(&host_kvm.lock); + + return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, + addr, size, &host_s2_pool, owner_id); +} + +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) +{ + /* + * Block mappings must be used with care in the host stage-2 as a + * kvm_pgtable_stage2_map() operation targeting a page in the range of + * an existing block will delete the block under the assumption that + * mappings in the rest of the block range can always be rebuilt lazily. + * That assumption is correct for the host stage-2 with RWX mappings + * targeting memory or RW mappings targeting MMIO ranges (see + * host_stage2_idmap() below which implements some of the host memory + * abort logic). However, this is not safe for any other mappings where + * the host stage-2 page-table is in fact the only place where this + * state is stored. In all those cases, it is safer to use page-level + * mappings, hence avoiding to lose the state because of side-effects in + * kvm_pgtable_stage2_map(). + */ + if (range_is_memory(addr, end)) + return prot != PKVM_HOST_MEM_PROT; + else + return prot != PKVM_HOST_MMIO_PROT; +} + static int host_stage2_idmap(u64 addr) { - enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W; struct kvm_mem_range range; bool is_memory = find_mem_range(addr, &range); + enum kvm_pgtable_prot prot; int ret; - if (is_memory) - prot |= KVM_PGTABLE_PROT_X; + prot = is_memory ? 
PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; hyp_spin_lock(&host_kvm.lock); - ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range); + ret = host_stage2_adjust_range(addr, &range); if (ret) goto unlock; - ret = __host_stage2_idmap(range.start, range.end, prot); - if (ret != -ENOMEM) + ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot); +unlock: + hyp_spin_unlock(&host_kvm.lock); + + return ret; +} + +static inline bool check_prot(enum kvm_pgtable_prot prot, + enum kvm_pgtable_prot required, + enum kvm_pgtable_prot denied) +{ + return (prot & (required | denied)) == required; +} + +int __pkvm_host_share_hyp(u64 pfn) +{ + phys_addr_t addr = hyp_pfn_to_phys(pfn); + enum kvm_pgtable_prot prot, cur; + void *virt = __hyp_va(addr); + enum pkvm_page_state state; + kvm_pte_t pte; + int ret; + + if (!addr_is_memory(addr)) + return -EINVAL; + + hyp_spin_lock(&host_kvm.lock); + hyp_spin_lock(&pkvm_pgd_lock); + + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL); + if (ret) goto unlock; + if (!pte) + goto map_shared; /* - * The pool has been provided with enough pages to cover all of memory - * with page granularity, but it is difficult to know how much of the - * MMIO range we will need to cover upfront, so we may need to 'recycle' - * the pages if we run out. + * Check attributes in the host stage-2 PTE. We need the page to be: + * - mapped RWX as we're sharing memory; + * - not borrowed, as that implies absence of ownership. + * Otherwise, we can't let it got through */ - ret = host_stage2_unmap_dev_all(); - if (ret) + cur = kvm_pgtable_stage2_pte_prot(pte); + prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED); + if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) { + ret = -EPERM; goto unlock; + } - ret = __host_stage2_idmap(range.start, range.end, prot); + state = pkvm_getstate(cur); + if (state == PKVM_PAGE_OWNED) + goto map_shared; -unlock: - hyp_spin_unlock(&host_kvm.lock); + /* + * Tolerate double-sharing the same page, but this requires + * cross-checking the hypervisor stage-1. + */ + if (state != PKVM_PAGE_SHARED_OWNED) { + ret = -EPERM; + goto unlock; + } - return ret; -} + ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL); + if (ret) + goto unlock; -int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) -{ - int ret; + /* + * If the page has been shared with the hypervisor, it must be + * already mapped as SHARED_BORROWED in its stage-1. + */ + cur = kvm_pgtable_hyp_pte_prot(pte); + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + if (!check_prot(cur, prot, ~prot)) + ret = -EPERM; + goto unlock; +map_shared: /* - * host_stage2_unmap_dev_all() currently relies on MMIO mappings being - * non-persistent, so don't allow changing page ownership in MMIO range. + * If the page is not yet shared, adjust mappings in both page-tables + * while both locks are held. */ - if (!range_is_memory(start, end)) - return -EINVAL; + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot); + BUG_ON(ret); - hyp_spin_lock(&host_kvm.lock); - ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start, - &host_s2_pool, pkvm_hyp_id); + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot); + BUG_ON(ret); + +unlock: + hyp_spin_unlock(&pkvm_pgd_lock); hyp_spin_unlock(&host_kvm.lock); - return ret != -EAGAIN ? 
ret : 0; + return ret; } void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index a8efdf0f9003..2fabeceb889a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -23,8 +23,8 @@ u64 __io_map_base; struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; unsigned int hyp_memblock_nr; -int __pkvm_create_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot) +static int __pkvm_create_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) { int err; @@ -67,13 +67,15 @@ out: return addr; } -int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot) { unsigned long start = (unsigned long)from; unsigned long end = (unsigned long)to; unsigned long virt_addr; phys_addr_t phys; + hyp_assert_lock_held(&pkvm_pgd_lock); + start = start & PAGE_MASK; end = PAGE_ALIGN(end); @@ -81,7 +83,8 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) int err; phys = hyp_virt_to_phys((void *)virt_addr); - err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot); + err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE, + phys, prot); if (err) return err; } @@ -89,6 +92,17 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return 0; } +int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +{ + int ret; + + hyp_spin_lock(&pkvm_pgd_lock); + ret = pkvm_create_mappings_locked(from, to, prot); + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +} + int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) { unsigned long start, end; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0b574d106519..57c27846320f 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -58,6 +58,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, { void *start, *end, *virt = hyp_phys_to_virt(phys); unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; + enum kvm_pgtable_prot prot; int ret, i; /* Recreate the hyp page-table using the early page allocator */ @@ -83,10 +84,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO); - if (ret) - return ret; - ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; @@ -95,10 +92,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO); - if (ret) - return ret; - ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP); if (ret) return ret; @@ -117,6 +110,24 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } + /* + * Map the host's .bss and .rodata sections RO in the hypervisor, but + * transfer the ownership from the host to the hypervisor itself to + * make sure it can't be donated or shared with another entity. + * + * The ownership transition requires matching changes in the host + * stage-2. This will be done later (see finalize_host_mappings()) once + * the hyp_vmemmap is addressable. 
+ */ + prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); + ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot); + if (ret) + return ret; + + ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot); + if (ret) + return ret; + return 0; } @@ -148,6 +159,57 @@ static void hpool_put_page(void *addr) hyp_put_page(&hpool, addr); } +static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + enum kvm_pgtable_prot prot; + enum pkvm_page_state state; + kvm_pte_t pte = *ptep; + phys_addr_t phys; + + if (!kvm_pte_valid(pte)) + return 0; + + if (level != (KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + phys = kvm_pte_to_phys(pte); + if (!addr_is_memory(phys)) + return 0; + + /* + * Adjust the host stage-2 mappings to match the ownership attributes + * configured in the hypervisor stage-1. + */ + state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); + switch (state) { + case PKVM_PAGE_OWNED: + return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id); + case PKVM_PAGE_SHARED_OWNED: + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); + break; + case PKVM_PAGE_SHARED_BORROWED: + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + break; + default: + return -EINVAL; + } + + return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); +} + +static int finalize_host_mappings(void) +{ + struct kvm_pgtable_walker walker = { + .cb = finalize_host_mappings_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), &walker); +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -167,6 +229,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = finalize_host_mappings(); + if (ret) + goto out; + pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_page = hyp_zalloc_hyp_page, .phys_to_virt = hyp_phys_to_virt, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f7af9688c1f7..a34b01cc8ab9 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -41,7 +41,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) ___activate_traps(vcpu); __activate_traps_common(vcpu); - val = CPTR_EL2_DEFAULT; + val = vcpu->arch.cptr_el2; val |= CPTR_EL2_TTA | CPTR_EL2_TAM; if (!update_fp_enabled(vcpu)) { val |= CPTR_EL2_TFP | CPTR_EL2_TZ; @@ -69,12 +69,10 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { extern char __kvm_hyp_host_vector[]; - u64 mdcr_el2, cptr; + u64 cptr; ___deactivate_traps(vcpu); - mdcr_el2 = read_sysreg(mdcr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; @@ -92,13 +90,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } - __deactivate_traps_common(); - - mdcr_el2 &= MDCR_EL2_HPMN_MASK; - mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; - mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; + __deactivate_traps_common(vcpu); - write_sysreg(mdcr_el2, mdcr_el2); write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); cptr = CPTR_EL2_DEFAULT; @@ -170,6 +163,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; + struct kvm_s2_mmu *mmu; bool pmu_switch_needed; u64 exit_code; @@ -213,7 +207,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); 
__sysreg_restore_state_nvhe(guest_ctxt); - __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu)); + mmu = kern_hyp_va(vcpu->arch.hw_mmu); + __load_stage2(mmu, kern_hyp_va(mmu->arch)); __activate_traps(vcpu); __hyp_vgic_restore_state(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 38ed0f6f2703..d296d617f589 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -34,12 +34,12 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, } /* - * __load_guest_stage2() includes an ISB only when the AT + * __load_stage2() includes an ISB only when the AT * workaround is applied. Take care of the opposite condition, * ensuring that we always have an ISB, but not two ISBs back * to back. */ - __load_guest_stage2(mmu); + __load_stage2(mmu, kern_hyp_va(mmu->arch)); asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 05321f4165e3..f8ceebe4982e 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -11,16 +11,12 @@ #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> -#define KVM_PTE_VALID BIT(0) #define KVM_PTE_TYPE BIT(1) #define KVM_PTE_TYPE_BLOCK 0 #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 -#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) -#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) - #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) @@ -40,6 +36,8 @@ #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) @@ -48,9 +46,7 @@ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) -#define KVM_PTE_LEAF_ATTR_S2_IGNORED GENMASK(58, 55) - -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56) +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) #define KVM_MAX_OWNER_ID 1 struct kvm_pgtable_walk_data { @@ -61,17 +57,6 @@ struct kvm_pgtable_walk_data { u64 end; }; -static u64 kvm_granule_shift(u32 level) -{ - /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ - return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); -} - -static u64 kvm_granule_size(u32 level) -{ - return BIT(kvm_granule_shift(level)); -} - #define KVM_PHYS_INVALID (-1ULL) static bool kvm_phys_is_valid(u64 phys) @@ -79,15 +64,6 @@ static bool kvm_phys_is_valid(u64 phys) return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX)); } -static bool kvm_level_supports_block_mapping(u32 level) -{ - /* - * Reject invalid block mappings and don't bother with 4TB mappings for - * 52-bit PAs. 
- */ - return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); -} - static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) { u64 granule = kvm_granule_size(level); @@ -135,11 +111,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_valid(kvm_pte_t pte) -{ - return pte & KVM_PTE_VALID; -} - static bool kvm_pte_table(kvm_pte_t pte, u32 level) { if (level == KVM_PGTABLE_MAX_LEVELS - 1) @@ -151,16 +122,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level) return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } -static u64 kvm_pte_to_phys(kvm_pte_t pte) -{ - u64 pa = pte & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; - - return pa; -} - static kvm_pte_t kvm_phys_to_pte(u64 pa) { kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; @@ -326,6 +287,45 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, return _kvm_pgtable_walk(&walk_data); } +struct leaf_walk_data { + kvm_pte_t pte; + u32 level; +}; + +static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + struct leaf_walk_data *data = arg; + + data->pte = *ptep; + data->level = level; + + return 0; +} + +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, + kvm_pte_t *ptep, u32 *level) +{ + struct leaf_walk_data data; + struct kvm_pgtable_walker walker = { + .cb = leaf_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &data, + }; + int ret; + + ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE), + PAGE_SIZE, &walker); + if (!ret) { + if (ptep) + *ptep = data.pte; + if (level) + *level = data.level; + } + + return ret; +} + struct hyp_map_data { u64 phys; kvm_pte_t attr; @@ -357,11 +357,47 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } +enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + u32 ap; + + if (!kvm_pte_valid(pte)) + return prot; + + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) + prot |= KVM_PGTABLE_PROT_X; + + ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); + if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) + prot |= KVM_PGTABLE_PROT_R; + else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW) + prot |= KVM_PGTABLE_PROT_RW; + + return prot; +} + +static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new) +{ + /* + * Tolerate KVM recreating the exact same mapping, or changing software + * bits if the existing mapping was valid. 
+ */ + if (old == new) + return false; + + if (!kvm_pte_valid(old)) + return true; + + return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW); +} + static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct hyp_map_data *data) { @@ -371,9 +407,8 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (!kvm_block_mapping_supported(addr, end, phys, level)) return false; - /* Tolerate KVM recreating the exact same mapping */ new = kvm_init_valid_leaf_pte(phys, data->attr, level); - if (old != new && !WARN_ON(kvm_pte_valid(old))) + if (hyp_pte_needs_update(old, new)) smp_store_release(ptep, new); data->phys += granule; @@ -438,6 +473,8 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; pgt->mm_ops = mm_ops; pgt->mmu = NULL; + pgt->force_pte_cb = NULL; + return 0; } @@ -475,6 +512,9 @@ struct stage2_map_data { void *memcache; struct kvm_pgtable_mm_ops *mm_ops; + + /* Force mappings to page granularity */ + bool force_pte; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) @@ -539,11 +579,29 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } +enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + + if (!kvm_pte_valid(pte)) + return prot; + + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R) + prot |= KVM_PGTABLE_PROT_R; + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) + prot |= KVM_PGTABLE_PROT_W; + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) + prot |= KVM_PGTABLE_PROT_X; + + return prot; +} + static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) { if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) @@ -588,6 +646,15 @@ static bool stage2_pte_executable(kvm_pte_t pte) return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } +static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level, + struct stage2_map_data *data) +{ + if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1))) + return false; + + return kvm_block_mapping_supported(addr, end, data->phys, level); +} + static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -597,7 +664,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - if (!kvm_block_mapping_supported(addr, end, phys, level)) + if (!stage2_leaf_mapping_allowed(addr, end, level, data)) return -E2BIG; if (kvm_phys_is_valid(phys)) @@ -641,7 +708,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, if (data->anchor) return 0; - if (!kvm_block_mapping_supported(addr, end, data->phys, level)) + if (!stage2_leaf_mapping_allowed(addr, end, level, data)) return 0; data->childp = kvm_pte_follow(*ptep, data->mm_ops); @@ -771,6 +838,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, .mmu = pgt->mmu, .memcache = mc, .mm_ops = pgt->mm_ops, + .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -802,6 +870,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .memcache = mc, .mm_ops = pgt->mm_ops, .owner_id = owner_id, + .force_pte = true, }; struct kvm_pgtable_walker walker = { 
.cb = stage2_map_walker, @@ -995,6 +1064,9 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, u32 level; kvm_pte_t set = 0, clr = 0; + if (prot & KVM_PTE_LEAF_ATTR_HI_SW) + return -EINVAL; + if (prot & KVM_PGTABLE_PROT_R) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; @@ -1043,9 +1115,11 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) return kvm_pgtable_walk(pgt, addr, size, &walker); } -int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, - struct kvm_pgtable_mm_ops *mm_ops, - enum kvm_pgtable_stage2_flags flags) + +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags, + kvm_pgtable_force_pte_cb_t force_pte_cb) { size_t pgd_sz; u64 vtcr = arch->vtcr; @@ -1063,6 +1137,7 @@ int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch pgt->mm_ops = mm_ops; pgt->mmu = &arch->mmu; pgt->flags = flags; + pgt->force_pte_cb = force_pte_cb; /* Ensure zeroed PGD pages are visible to the hardware walker */ dsb(ishst); @@ -1102,77 +1177,3 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz); pgt->pgd = NULL; } - -#define KVM_PTE_LEAF_S2_COMPAT_MASK (KVM_PTE_LEAF_ATTR_S2_PERMS | \ - KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ - KVM_PTE_LEAF_ATTR_S2_IGNORED) - -static int stage2_check_permission_walker(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) -{ - kvm_pte_t old_attr, pte = *ptep, *new_attr = arg; - - /* - * Compatible mappings are either invalid and owned by the page-table - * owner (whose id is 0), or valid with matching permission attributes. - */ - if (kvm_pte_valid(pte)) { - old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK; - if (old_attr != *new_attr) - return -EEXIST; - } else if (pte) { - return -EEXIST; - } - - return 0; -} - -int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot, - struct kvm_mem_range *range) -{ - kvm_pte_t attr; - struct kvm_pgtable_walker check_perm_walker = { - .cb = stage2_check_permission_walker, - .flags = KVM_PGTABLE_WALK_LEAF, - .arg = &attr, - }; - u64 granule, start, end; - u32 level; - int ret; - - ret = stage2_set_prot_attr(pgt, prot, &attr); - if (ret) - return ret; - attr &= KVM_PTE_LEAF_S2_COMPAT_MASK; - - for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) { - granule = kvm_granule_size(level); - start = ALIGN_DOWN(addr, granule); - end = start + granule; - - if (!kvm_level_supports_block_mapping(level)) - continue; - - if (start < range->start || range->end < end) - continue; - - /* - * Check the presence of existing mappings with incompatible - * permissions within the current block range, and try one level - * deeper if one is found. - */ - ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker); - if (ret != -EEXIST) - break; - } - - if (!ret) { - range->start = start; - range->end = end; - } - - return ret; -} diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c index d654921dd09b..578670e3f608 100644 --- a/arch/arm64/kvm/hyp/reserved_mem.c +++ b/arch/arm64/kvm/hyp/reserved_mem.c @@ -92,12 +92,10 @@ void __init kvm_hyp_reserve(void) * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. 
*/ hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; - hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), - ALIGN(hyp_mem_size, PMD_SIZE), - PMD_SIZE); + hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), + PMD_SIZE); if (!hyp_mem_base) - hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), - hyp_mem_size, PAGE_SIZE); + hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); else hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); @@ -105,7 +103,6 @@ void __init kvm_hyp_reserve(void) kvm_err("Failed to reserve hyp memory\n"); return; } - memblock_reserve(hyp_mem_base, hyp_mem_size); kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, hyp_mem_base); diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c index f1e2e5a00933..289689b2682d 100644 --- a/arch/arm64/kvm/hyp/vhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -20,7 +20,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) __debug_switch_to_host_common(vcpu); } -u32 __kvm_get_mdcr_el2(void) +u64 __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b3229924d243..ded2c66675f0 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -91,17 +91,9 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu) __activate_traps_common(vcpu); } -void deactivate_traps_vhe_put(void) +void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) { - u64 mdcr_el2 = read_sysreg(mdcr_el2); - - mdcr_el2 &= MDCR_EL2_HPMN_MASK | - MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT | - MDCR_EL2_TPMS; - - write_sysreg(mdcr_el2, mdcr_el2); - - __deactivate_traps_common(); + __deactivate_traps_common(vcpu); } /* Switch to the guest for VHE systems running in EL2 */ @@ -124,11 +116,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * * We have already configured the guest's stage 1 translation in * kvm_vcpu_load_sysregs_vhe above. We must now call - * __load_guest_stage2 before __activate_traps, because - * __load_guest_stage2 configures stage 2 translation, and + * __load_stage2 before __activate_traps, because + * __load_stage2 configures stage 2 translation, and * __activate_traps clear HCR_EL2.TGE (among other things). */ - __load_guest_stage2(vcpu->arch.hw_mmu); + __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); __activate_traps(vcpu); __kvm_adjust_pc(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 2a0b8c88d74f..007a12dd4351 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -101,7 +101,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *host_ctxt; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; - deactivate_traps_vhe_put(); + deactivate_traps_vhe_put(vcpu); __sysreg_save_el1_state(guest_ctxt); __sysreg_save_user_state(guest_ctxt); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 66f17349f0c3..24cef9b87f9e 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -50,10 +50,10 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * * ARM erratum 1165522 requires some special handling (again), * as we need to make sure both stages of translation are in - * place before clearing TGE. __load_guest_stage2() already + * place before clearing TGE. __load_stage2() already * has an ISB in order to deal with this. 
*/ - __load_guest_stage2(mmu); + __load_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg(val, hcr_el2); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0625bf2353c2..1a94a7ca48f2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -80,6 +80,7 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) */ void kvm_flush_remote_tlbs(struct kvm *kvm) { + ++kvm->stat.generic.remote_tlb_flush_requests; kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); } @@ -259,10 +260,8 @@ static int __create_hyp_mappings(unsigned long start, unsigned long size, { int err; - if (!kvm_host_owns_hyp_mappings()) { - return kvm_call_hyp_nvhe(__pkvm_create_mappings, - start, size, phys, prot); - } + if (WARN_ON(!kvm_host_owns_hyp_mappings())) + return -EINVAL; mutex_lock(&kvm_hyp_pgd_mutex); err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); @@ -282,6 +281,21 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) } } +static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end) +{ + phys_addr_t addr; + int ret; + + for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) { + ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, + __phys_to_pfn(addr)); + if (ret) + return ret; + } + + return 0; +} + /** * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode * @from: The virtual kernel start address of the range @@ -302,6 +316,13 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) if (is_kernel_in_hyp_mode()) return 0; + if (!kvm_host_owns_hyp_mappings()) { + if (WARN_ON(prot != PAGE_HYP)) + return -EPERM; + return pkvm_share_hyp(kvm_kaddr_to_phys(from), + kvm_kaddr_to_phys(to)); + } + start = start & PAGE_MASK; end = PAGE_ALIGN(end); @@ -433,6 +454,32 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, return 0; } +static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { + /* We shouldn't need any other callback to walk the PT */ + .phys_to_virt = kvm_host_va, +}; + +static int get_user_mapping_size(struct kvm *kvm, u64 addr) +{ + struct kvm_pgtable pgt = { + .pgd = (kvm_pte_t *)kvm->mm->pgd, + .ia_bits = VA_BITS, + .start_level = (KVM_PGTABLE_MAX_LEVELS - + CONFIG_PGTABLE_LEVELS), + .mm_ops = &kvm_user_mm_ops, + }; + kvm_pte_t pte = 0; /* Keep GCC quiet... */ + u32 level = ~0; + int ret; + + ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); + VM_BUG_ON(ret); + VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); + VM_BUG_ON(!(pte & PTE_VALID)); + + return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); +} + static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_host_zalloc_pages_exact, @@ -485,7 +532,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) mmu->arch = &kvm->arch; mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); - mmu->vmid.vmid_gen = 0; + WRITE_ONCE(mmu->vmid.vmid_gen, 0); return 0; out_destroy_pgtable: @@ -780,7 +827,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * Returns the size of the mapping. */ static unsigned long -transparent_hugepage_adjust(struct kvm_memory_slot *memslot, +transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) { @@ -791,8 +838,8 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot, * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. 
*/ - if (kvm_is_transparent_hugepage(pfn) && - fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && + get_user_mapping_size(kvm, hva) >= PMD_SIZE) { /* * The address we faulted on is backed by a transparent huge * page. However, because we map the compound huge page and @@ -814,7 +861,7 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot, *ipap &= PMD_MASK; kvm_release_pfn_clean(pfn); pfn &= ~(PTRS_PER_PMD - 1); - kvm_get_pfn(pfn); + get_page(pfn_to_page(pfn)); *pfnp = pfn; return PMD_SIZE; @@ -1050,9 +1097,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible. */ - if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) - vma_pagesize = transparent_hugepage_adjust(memslot, hva, - &pfn, &fault_ipa); + if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { + if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE) + vma_pagesize = fault_granule; + else + vma_pagesize = transparent_hugepage_adjust(kvm, memslot, + hva, &pfn, + &fault_ipa); + } if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new VM_SHARED VMA */ diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index 151c31fb9860..f9bb3b14130e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -50,7 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { - if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled()) + if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled()) static_branch_enable(&kvm_arm_pmu_available); return perf_register_guest_info_callbacks(&kvm_guest_cbs); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f33825c995cb..f5065f23b413 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -373,7 +373,6 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); - reg &= kvm_pmu_valid_counter_mask(vcpu); } return reg; @@ -564,20 +563,21 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) */ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); int i; if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } else { - kvm_pmu_disable_counter_mask(vcpu, mask); + kvm_pmu_disable_counter_mask(vcpu, + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) kvm_pmu_set_counter_value(vcpu, i, 0); @@ -745,7 +745,7 @@ int kvm_pmu_probe_pmuver(void) struct perf_event_attr attr = { }; struct perf_event *event; struct arm_pmu *pmu; - int pmuver = 0xf; + int pmuver = ID_AA64DFR0_PMUVER_IMP_DEF; /* * Create a dummy event that only counts user cycles. 
As we'll never @@ -770,7 +770,7 @@ int kvm_pmu_probe_pmuver(void) if (IS_ERR(event)) { pr_err_once("kvm: pmu event creation failed %ld\n", PTR_ERR(event)); - return 0xf; + return ID_AA64DFR0_PMUVER_IMP_DEF; } if (event->pmu) { @@ -923,7 +923,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!vcpu->kvm->arch.pmuver) vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver(); - if (vcpu->kvm->arch.pmuver == 0xf) + if (vcpu->kvm->arch.pmuver == ID_AA64DFR0_PMUVER_IMP_DEF) return -ENODEV; switch (attr->attr) { diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index db4056ecccfd..74c47d420253 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -59,6 +59,12 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } +static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, + unsigned long affinity) +{ + return !(affinity & ~MPIDR_HWID_BITMASK); +} + static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct vcpu_reset_state *reset_state; @@ -66,9 +72,9 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) struct kvm_vcpu *vcpu = NULL; unsigned long cpu_id; - cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; - if (vcpu_mode_is_32bit(source_vcpu)) - cpu_id &= ~((u32) 0); + cpu_id = smccc_get_arg1(source_vcpu); + if (!kvm_psci_valid_affinity(source_vcpu, cpu_id)) + return PSCI_RET_INVALID_PARAMS; vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); @@ -126,6 +132,9 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) target_affinity = smccc_get_arg1(vcpu); lowest_affinity_level = smccc_get_arg2(vcpu); + if (!kvm_psci_valid_affinity(vcpu, target_affinity)) + return PSCI_RET_INVALID_PARAMS; + /* Determine target affinity mask */ target_affinity_mask = psci_affinity_mask(lowest_affinity_level); if (!target_affinity_mask) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index cba7872d69a8..5ce36b0a3343 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -210,10 +210,16 @@ static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu) */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { + struct vcpu_reset_state reset_state; int ret; bool loaded; u32 pstate; + mutex_lock(&vcpu->kvm->lock); + reset_state = vcpu->arch.reset_state; + WRITE_ONCE(vcpu->arch.reset_state.reset, false); + mutex_unlock(&vcpu->kvm->lock); + /* Reset PMU outside of the non-preemptible section */ kvm_pmu_vcpu_reset(vcpu); @@ -276,8 +282,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) * Additional reset state handling that PSCI may have imposed on us. * Must be done after all the sys_reg reset. 
*/ - if (vcpu->arch.reset_state.reset) { - unsigned long target_pc = vcpu->arch.reset_state.pc; + if (reset_state.reset) { + unsigned long target_pc = reset_state.pc; /* Gracefully handle Thumb2 entry point */ if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { @@ -286,13 +292,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } /* Propagate caller endianness */ - if (vcpu->arch.reset_state.be) + if (reset_state.be) kvm_vcpu_set_be(vcpu); *vcpu_pc(vcpu) = target_pc; - vcpu_set_reg(vcpu, 0, vcpu->arch.reset_state.r0); - - vcpu->arch.reset_state.reset = false; + vcpu_set_reg(vcpu, 0, reset_state.r0); } /* Reset timer */ @@ -311,31 +315,26 @@ u32 get_kvm_ipa_limit(void) int kvm_set_ipa_limit(void) { - unsigned int parange, tgran_2; + unsigned int parange; u64 mmfr0; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_PARANGE_SHIFT); + /* + * IPA size beyond 48 bits could not be supported + * on either 4K or 16K page size. Hence let's cap + * it to 48 bits, in case it's reported as larger + * on the system. + */ + if (PAGE_SIZE != SZ_64K) + parange = min(parange, (unsigned int)ID_AA64MMFR0_PARANGE_48); /* * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at * Stage-2. If not, things will stop very quickly. */ - switch (PAGE_SIZE) { - default: - case SZ_4K: - tgran_2 = ID_AA64MMFR0_TGRAN4_2_SHIFT; - break; - case SZ_16K: - tgran_2 = ID_AA64MMFR0_TGRAN16_2_SHIFT; - break; - case SZ_64K: - tgran_2 = ID_AA64MMFR0_TGRAN64_2_SHIFT; - break; - } - - switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) { + switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_TGRAN_2_SHIFT)) { case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE: kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; @@ -369,7 +368,7 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); if (phys_shift) { if (phys_shift > kvm_ipa_limit || - phys_shift < 32) + phys_shift < ARM64_MIN_PARANGE_BITS) return -EINVAL; } else { phys_shift = KVM_PHYS_SHIFT; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f6f126eb6ac1..1d46e185f31e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -44,10 +44,6 @@ * 64bit interface. */ -#define reg_to_encoding(x) \ - sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ - (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) - static bool read_from_write_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) @@ -318,14 +314,14 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, /* * We want to avoid world-switching all the DBG registers all the * time: - * + * * - If we've touched any debug register, it is likely that we're * going to touch more of them. It then makes sense to disable the * traps and start doing the save/restore dance * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is * then mandatory to save/restore the registers, as the guest * depends on them. - * + * * For this, we use a DIRTY bit, indicating the guest has modified the * debug registers, used as follow: * @@ -603,6 +599,41 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); + + /* No PMU available, any PMU reg may UNDEF... 
*/ + if (!kvm_arm_support_pmu_v3()) + return; + + n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; + n &= ARMV8_PMU_PMCR_N_MASK; + if (n) + mask |= GENMASK(n - 1, 0); + + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= mask; +} + +static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); +} + +static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK; +} + +static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; +} + static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr, val; @@ -845,7 +876,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, kvm_pmu_disable_counter_mask(vcpu, val); } } else { - p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } return true; @@ -869,7 +900,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* accessing PMINTENCLR_EL1 */ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; } else { - p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } return true; @@ -891,7 +922,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* accessing PMOVSCLR_EL0 */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); } else { - p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); } return true; @@ -944,16 +975,18 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr } #define PMU_SYS_REG(r) \ - SYS_DESC(r), .reset = reset_unknown, .visibility = pmu_visibility + SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ + .reset = reset_pmevcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ #define PMU_PMEVTYPER_EL0(n) \ { PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)), \ + .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -1026,8 +1059,6 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } -#define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT)) - /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r, bool raz) @@ -1038,40 +1069,40 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, switch (id) { case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) - val &= ~FEATURE(ID_AA64PFR0_SVE); - val &= ~FEATURE(ID_AA64PFR0_AMU); - val &= ~FEATURE(ID_AA64PFR0_CSV2); - val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); - val &= ~FEATURE(ID_AA64PFR0_CSV3); - val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_AMU); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); + val &= 
~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); break; case SYS_ID_AA64PFR1_EL1: - val &= ~FEATURE(ID_AA64PFR1_MTE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); if (kvm_has_mte(vcpu->kvm)) { u64 pfr, mte; pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT); - val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), mte); } break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) - val &= ~(FEATURE(ID_AA64ISAR1_APA) | - FEATURE(ID_AA64ISAR1_API) | - FEATURE(ID_AA64ISAR1_GPA) | - FEATURE(ID_AA64ISAR1_GPI)); + val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI)); break; case SYS_ID_AA64DFR0_EL1: /* Limit debug to ARMv8.0 */ - val &= ~FEATURE(ID_AA64DFR0_DEBUGVER); - val |= FIELD_PREP(FEATURE(ID_AA64DFR0_DEBUGVER), 6); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), 6); /* Limit guests to PMUv3 for ARMv8.4 */ val = cpuid_feature_cap_perfmon_field(val, ID_AA64DFR0_PMUVER_SHIFT, kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0); /* Hide SPE from guests */ - val &= ~FEATURE(ID_AA64DFR0_PMSVER); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER); break; case SYS_ID_DFR0_EL1: /* Limit guests to PMUv3 for ARMv8.4 */ @@ -1249,6 +1280,20 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(vcpu, rd, uaddr, true); } +static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + int err; + u64 val; + + /* Perform the access even if we are going to ignore the value */ + err = reg_from_user(&val, uaddr, sys_reg_to_index(rd)); + if (err) + return err; + + return 0; +} + static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -1592,16 +1637,21 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pmcnten, .reg = PMCNTENSET_EL0 }, { PMU_SYS_REG(SYS_PMOVSCLR_EL0), .access = access_pmovs, .reg = PMOVSSET_EL0 }, + /* + * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was + * previously (and pointlessly) advertised in the past... 
+ */ { PMU_SYS_REG(SYS_PMSWINC_EL0), - .access = access_pmswinc, .reg = PMSWINC_EL0 }, + .get_user = get_raz_id_reg, .set_user = set_wi_reg, + .access = access_pmswinc, .reset = NULL }, { PMU_SYS_REG(SYS_PMSELR_EL0), - .access = access_pmselr, .reg = PMSELR_EL0 }, + .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, { PMU_SYS_REG(SYS_PMCEID0_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCCNTR_EL0), - .access = access_pmu_evcntr, .reg = PMCCNTR_EL0 }, + .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(SYS_PMXEVCNTR_EL0), @@ -2106,23 +2156,6 @@ static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, return 0; } -static int match_sys_reg(const void *key, const void *elt) -{ - const unsigned long pval = (unsigned long)key; - const struct sys_reg_desc *r = elt; - - return pval - reg_to_encoding(r); -} - -static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num) -{ - unsigned long pval = reg_to_encoding(params); - - return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); -} - int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); @@ -2365,13 +2398,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) trace_kvm_handle_sys_reg(esr); - params.Op0 = (esr >> 20) & 3; - params.Op1 = (esr >> 14) & 0x7; - params.CRn = (esr >> 10) & 0xf; - params.CRm = (esr >> 1) & 0xf; - params.Op2 = (esr >> 17) & 0x7; + params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); - params.is_write = !(esr & 1); ret = emulate_sys_reg(vcpu, ¶ms); diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 9d0621417c2a..cc0cc95a0280 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -11,6 +11,12 @@ #ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__ #define __ARM64_KVM_SYS_REGS_LOCAL_H__ +#include <linux/bsearch.h> + +#define reg_to_encoding(x) \ + sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ + (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) + struct sys_reg_params { u8 Op0; u8 Op1; @@ -21,6 +27,14 @@ struct sys_reg_params { bool is_write; }; +#define esr_sys64_to_params(esr) \ + ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ + .Op1 = ((esr) >> 14) & 0x7, \ + .CRn = ((esr) >> 10) & 0xf, \ + .CRm = ((esr) >> 1) & 0xf, \ + .Op2 = ((esr) >> 17) & 0x7, \ + .is_write = !((esr) & 1) }) + struct sys_reg_desc { /* Sysreg string for debug */ const char *name; @@ -152,6 +166,23 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1, return i1->Op2 - i2->Op2; } +static inline int match_sys_reg(const void *key, const void *elt) +{ + const unsigned long pval = (unsigned long)key; + const struct sys_reg_desc *r = elt; + + return pval - reg_to_encoding(r); +} + +static inline const struct sys_reg_desc * +find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], + unsigned int num) +{ + unsigned long pval = reg_to_encoding(params); + + return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); +} + const struct sys_reg_desc *find_reg_by_id(u64 id, struct sys_reg_params *params, const struct sys_reg_desc table[], diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 8d78acc4fba7..064a58c19f48 100644 --- 
a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -78,13 +78,17 @@ TRACE_EVENT(kvm_arm_clear_debug, TP_printk("flags: 0x%08x", __entry->guest_debug) ); +/* + * The dreg32 name is a leftover from a distant past. This will really + * output a 64bit value... + */ TRACE_EVENT(kvm_arm_set_dreg32, - TP_PROTO(const char *name, __u32 value), + TP_PROTO(const char *name, __u64 value), TP_ARGS(name, value), TP_STRUCT__entry( __field(const char *, name) - __field(__u32, value) + __field(__u64, value) ), TP_fast_assign( @@ -92,7 +96,7 @@ TRACE_EVENT(kvm_arm_set_dreg32, __entry->value = value; ), - TP_printk("%s: 0x%08x", __entry->name, __entry->value) + TP_printk("%s: 0x%llx", __entry->name, __entry->value) ); TRACE_DEFINE_SIZEOF(__u64); diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index a016f07adc28..5f9014ae595b 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -282,7 +282,7 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, case GIC_CPU_PRIMASK: /* * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than + * PMR field as GICH_VMCR.VMPriMask rather than * GICC_PMR.Priority, so we expose the upper five bits of * priority mask to userspace using the lower bits in the * unsigned long. @@ -329,7 +329,7 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, case GIC_CPU_PRIMASK: /* * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than + * PMR field as GICH_VMCR.VMPriMask rather than * GICC_PMR.Priority, so we expose the upper five bits of * priority mask to userspace using the lower bits in the * unsigned long. diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 2c580204f1dc..95a18cec14a3 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) u32 val = cpuif->vgic_lr[lr]; u32 cpuid, intid = val & GICH_LR_VIRTUALID; struct vgic_irq *irq; + bool deactivated; /* Extract the source vCPU id from the LR */ cpuid = val & GICH_LR_PHYSID_CPUID; @@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit */ + /* Always preserve the active bit, note deactivation */ + deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); irq->active = !!(val & GICH_LR_ACTIVE_BIT); if (irq->active && vgic_irq_is_sgi(intid)) @@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) irq->pending_latch = false; - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - * - * Another case is when the interrupt requires a helping hand - * on deactivation (no HW deactivation, for example). 
- */ - if (vgic_irq_is_mapped_level(irq)) { - bool resample = false; - - if (val & GICH_LR_PENDING_BIT) { - irq->line_level = vgic_get_phys_line_level(irq); - resample = !irq->line_level; - } else if (vgic_irq_needs_resampling(irq) && - !(irq->active || irq->pending_latch)) { - resample = true; - } - - if (resample) - vgic_irq_set_phys_active(irq, false); - } + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 66004f61cd83..21a6207fb2ee 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) u32 intid, cpuid; struct vgic_irq *irq; bool is_v2_sgi = false; + bool deactivated; cpuid = val & GICH_LR_PHYSID_CPUID; cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; @@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit */ + /* Always preserve the active bit, note deactivation */ + deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); if (irq->active && is_v2_sgi) @@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) irq->pending_latch = false; - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - * - * Another case is when the interrupt requires a helping hand - * on deactivation (no HW deactivation, for example). - */ - if (vgic_irq_is_mapped_level(irq)) { - bool resample = false; - - if (val & ICH_LR_PENDING_BIT) { - irq->line_level = vgic_get_phys_line_level(irq); - resample = !irq->line_level; - } else if (vgic_irq_needs_resampling(irq) && - !(irq->active || irq->pending_latch)) { - resample = true; - } - - if (resample) - vgic_irq_set_phys_active(irq, false); - } + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 111bff47e471..5dad4996cfb2 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -106,7 +106,6 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, if (intid >= VGIC_MIN_LPI) return vgic_get_lpi(kvm, intid); - WARN(1, "Looking up struct vgic_irq for reserved INTID"); return NULL; } @@ -1022,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) return map_is_active; } + +/* + * Level-triggered mapped IRQs are special because we only observe rising + * edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample the physical + * line and set the line level, because the device state could have changed + * or we simply need to process the still pending interrupt later. 
+ * + * We could also have entered the guest with the interrupt active+pending. + * On the next exit, we need to re-evaluate the pending state, as it could + * otherwise result in a spurious interrupt by injecting a now potentially + * stale pending state. + * + * If this causes us to lower the level, we have to also clear the physical + * active state, since we will otherwise never be told when the interrupt + * becomes asserted again. + * + * Another case is when the interrupt requires a helping hand on + * deactivation (no HW deactivation, for example). + */ +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending) +{ + if (vgic_irq_is_mapped_level(irq)) { + bool resample = false; + + if (unlikely(vgic_irq_needs_resampling(irq))) { + resample = !(irq->active || irq->pending_latch); + } else if (lr_pending || (lr_deactivated && irq->line_level)) { + irq->line_level = vgic_get_phys_line_level(irq); + resample = !irq->line_level; + } + + if (resample) + vgic_irq_set_phys_active(irq, false); + } +} diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index dc1f3d1657ee..14a9218641f5 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags); void vgic_kick_vcpus(struct kvm *kvm); +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending); int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, phys_addr_t addr, phys_addr_t alignment); diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 6dd56a49790a..0941180a86d3 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 lib-y := clear_user.o delay.o copy_from_user.o \ - copy_to_user.o copy_in_user.o copy_page.o \ + copy_to_user.o copy_page.o \ clear_page.o csum.o insn.o memchr.o memcpy.o \ memset.o memcmp.o strcmp.o strncmp.o strlen.o \ strnlen.o strchr.o strrchr.o tishift.o diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S deleted file mode 100644 index dbea3799c3ef..000000000000 --- a/arch/arm64/lib/copy_in_user.S +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copy from user space to user space - * - * Copyright (C) 2012 ARM Ltd. 
- */ - -#include <linux/linkage.h> - -#include <asm/asm-uaccess.h> -#include <asm/assembler.h> -#include <asm/cache.h> - -/* - * Copy from user space to user space (alignment handled by the hardware) - * - * Parameters: - * x0 - to - * x1 - from - * x2 - n - * Returns: - * x0 - bytes not copied - */ - .macro ldrb1 reg, ptr, val - user_ldst 9998f, ldtrb, \reg, \ptr, \val - .endm - - .macro strb1 reg, ptr, val - user_ldst 9998f, sttrb, \reg, \ptr, \val - .endm - - .macro ldrh1 reg, ptr, val - user_ldst 9997f, ldtrh, \reg, \ptr, \val - .endm - - .macro strh1 reg, ptr, val - user_ldst 9997f, sttrh, \reg, \ptr, \val - .endm - - .macro ldr1 reg, ptr, val - user_ldst 9997f, ldtr, \reg, \ptr, \val - .endm - - .macro str1 reg, ptr, val - user_ldst 9997f, sttr, \reg, \ptr, \val - .endm - - .macro ldp1 reg1, reg2, ptr, val - user_ldp 9997f, \reg1, \reg2, \ptr, \val - .endm - - .macro stp1 reg1, reg2, ptr, val - user_stp 9997f, \reg1, \reg2, \ptr, \val - .endm - -end .req x5 -srcin .req x15 -SYM_FUNC_START(__arch_copy_in_user) - add end, x0, x2 - mov srcin, x1 -#include "copy_template.S" - mov x0, #0 - ret -SYM_FUNC_END(__arch_copy_in_user) -EXPORT_SYMBOL(__arch_copy_in_user) - - .section .fixup,"ax" - .align 2 -9997: cmp dst, dstin - b.ne 9998f - // Before being absolutely sure we couldn't copy anything, try harder -USER(9998f, ldtrb tmp1w, [srcin]) -USER(9998f, sttrb tmp1w, [dst]) - add dst, dst, #1 -9998: sub x0, end, dst // bytes not copied - ret - .previous diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index d7bee210a798..83bcad72ec97 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -173,4 +173,4 @@ L(done): ret SYM_FUNC_END_PI(strcmp) -EXPORT_SYMBOL_NOKASAN(strcmp) +EXPORT_SYMBOL_NOHWKASAN(strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index 48d44f7fddb1..e42bcfcd37e6 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -258,4 +258,4 @@ L(ret0): ret SYM_FUNC_END_PI(strncmp) -EXPORT_SYMBOL_NOKASAN(strncmp) +EXPORT_SYMBOL_NOHWKASAN(strncmp) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index edc8e950bada..37a81754d9b6 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -30,6 +30,7 @@ #include <linux/crash_dump.h> #include <linux/hugetlb.h> #include <linux/acpi_iort.h> +#include <linux/kmemleak.h> #include <asm/boot.h> #include <asm/fixmap.h> @@ -74,6 +75,7 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init; static void __init reserve_crashkernel(void) { unsigned long long crash_base, crash_size; + unsigned long long crash_max = arm64_dma_phys_limit; int ret; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), @@ -84,37 +86,27 @@ static void __init reserve_crashkernel(void) crash_size = PAGE_ALIGN(crash_size); - if (crash_base == 0) { - /* Current arm64 boot protocol requires 2MB alignment */ - crash_base = memblock_find_in_range(0, arm64_dma_phys_limit, - crash_size, SZ_2M); - if (crash_base == 0) { - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - } else { - /* User specifies base address explicitly. */ - if (!memblock_is_region_memory(crash_base, crash_size)) { - pr_warn("cannot reserve crashkernel: region is not memory\n"); - return; - } + /* User specifies base address explicitly. 
*/ + if (crash_base) + crash_max = crash_base + crash_size; - if (memblock_is_region_reserved(crash_base, crash_size)) { - pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n"); - return; - } - - if (!IS_ALIGNED(crash_base, SZ_2M)) { - pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n"); - return; - } + /* Current arm64 boot protocol requires 2MB alignment */ + crash_base = memblock_phys_alloc_range(crash_size, SZ_2M, + crash_base, crash_max); + if (!crash_base) { + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; } - memblock_reserve(crash_base, crash_size); pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); + /* + * The crashkernel memory will be removed from the kernel linear + * map. Inform kmemleak so that it won't try to access it. + */ + kmemleak_ignore_phys(crash_base); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } @@ -236,7 +228,21 @@ early_param("mem", early_mem); void __init arm64_memblock_init(void) { - const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); + s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); + + /* + * Corner case: 52-bit VA capable systems running KVM in nVHE mode may + * be limited in their ability to support a linear map that exceeds 51 + * bits of VA space, depending on the placement of the ID map. Given + * that the placement of the ID map may be randomized, let's simply + * limit the kernel's linear map to 51 bits as well if we detect this + * configuration. + */ + if (IS_ENABLED(CONFIG_KVM) && vabits_actual == 52 && + is_hyp_mode_available() && !is_kernel_in_hyp_mode()) { + pr_info("Capping linear region to 51 bits for KVM in nVHE mode on LVA capable hardware.\n"); + linear_region_size = min_t(u64, linear_region_size, BIT(51)); + } /* Remove memory above our supported physical address size */ memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9ff0de1b2b93..cfd9deb347c3 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1502,8 +1502,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 2716f6395ba7..9d4d898df76b 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -82,6 +82,7 @@ config CSKY select PCI_SYSCALL if PCI select PCI_MSI if PCI select SET_FS + select TRACE_IRQFLAGS_SUPPORT config LOCKDEP_SUPPORT def_bool y @@ -139,9 +140,6 @@ config STACKTRACE_SUPPORT config TIME_LOW_RES def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config CPU_TLB_SIZE int default "128" if (CPU_CK610 || CPU_CK807 || CPU_CK810) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 07ff17ea33de..fb91b069dc69 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -56,17 +56,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, } } -void flush_kernel_dcache_page(struct page *page) -{ - struct address_space *mapping; - - mapping = page_mapping_file(page); - - if (!mapping || mapping_mapped(mapping)) - dcache_wbinv_all(); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void flush_cache_range(struct 
vm_area_struct *vma, unsigned long start, unsigned long end) { diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 6cab7afae962..ed62e2066ba7 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -14,12 +14,10 @@ extern void flush_dcache_page(struct page *); #define flush_cache_page(vma, page, pfn) cache_wbinv_all() #define flush_cache_dup_mm(mm) cache_wbinv_all() -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -extern void flush_kernel_dcache_page(struct page *); - #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { dcache_wbinv_all(); diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 68b22b499aeb..8fffa34d4e1c 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -283,8 +283,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr) * normal page fault. */ regs->pc = (unsigned long) cur->addr; - if (!instruction_pointer(regs)) - BUG(); + BUG_ON(!instruction_pointer(regs)); if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index 312f046d452d..bc4238b9f709 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -260,8 +260,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } } diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index 5d8b969cd8f3..bdbe988d8dbc 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -39,10 +39,6 @@ void __init base_trap_init(void) { } -void __init trap_init(void) -{ -} - asmlinkage void set_esp0(unsigned long ssp) { current->thread.esp0 = ssp; diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index aab1a40eb653..15dd8f38b698 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -32,6 +32,7 @@ config HEXAGON select GENERIC_CPU_DEVICES select SET_FS select ARCH_WANT_LD_ORPHAN_WARN + select TRACE_IRQFLAGS_SUPPORT help Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. @@ -53,9 +54,6 @@ config EARLY_PRINTK config MMU def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config GENERIC_CSUM def_bool y diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index 904134b37232..edfc35dafeb1 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -28,10 +28,6 @@ #define TRAP_SYSCALL 1 #define TRAP_DEBUG 0xdb -void __init trap_init(void) -{ -} - #ifdef CONFIG_GENERIC_BUG /* Maybe should resemble arch/sh/kernel/traps.c ?? 
*/ int is_valid_bugaddr(unsigned long addr) diff --git a/arch/ia64/Kbuild b/arch/ia64/Kbuild index a4e40e534e6a..e77cc76d228c 100644 --- a/arch/ia64/Kbuild +++ b/arch/ia64/Kbuild @@ -1 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-y += kernel/ mm/ +obj-$(CONFIG_IA64_SGI_UV) += uv/ diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 467b7e7f967c..7e548c654a29 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -47,8 +47,6 @@ KBUILD_CFLAGS += $(cflags-y) head-y := arch/ia64/kernel/head.o libs-y += arch/ia64/lib/ -core-y += arch/ia64/kernel/ arch/ia64/mm/ -core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/ drivers-y += arch/ia64/pci/ arch/ia64/hp/common/ diff --git a/arch/ia64/include/asm/meminit.h b/arch/ia64/include/asm/meminit.h index 6c47a239fc26..f1d5bf2ba847 100644 --- a/arch/ia64/include/asm/meminit.h +++ b/arch/ia64/include/asm/meminit.h @@ -29,7 +29,6 @@ struct rsvd_region { }; extern struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1]; -extern int num_rsvd_regions; extern void find_memory (void); extern void reserve_memory (void); @@ -40,7 +39,6 @@ extern unsigned long efi_memmap_init(u64 *s, u64 *e); extern int find_max_min_low_pfn (u64, u64, void *); extern unsigned long vmcore_find_descriptor_size(unsigned long address); -extern int reserve_elfcorehdr(u64 *start, u64 *end); /* * For rounding an address to the next IA64_GRANULE_SIZE or order diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index e2af6b172200..96d13cb7c19f 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -906,6 +906,6 @@ EXPORT_SYMBOL(acpi_unregister_ioapic); /* * acpi_suspend_lowlevel() - save kernel state and suspend. * - * TBD when when IA64 starts to support suspend... + * TBD when IA64 starts to support suspend... */ int acpi_suspend_lowlevel(void) { return 0; } diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index dd595fbd8006..31fb84de2d21 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -131,7 +131,7 @@ unsigned long ia64_cache_stride_shift = ~0; * We use a special marker for the end of memory and it uses the extra (+1) slot */ struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata; -int num_rsvd_regions __initdata; +static int num_rsvd_regions __initdata; /* @@ -325,6 +325,31 @@ static inline void __init setup_crashkernel(unsigned long total, int *n) {} #endif +#ifdef CONFIG_CRASH_DUMP +static int __init reserve_elfcorehdr(u64 *start, u64 *end) +{ + u64 length; + + /* We get the address using the kernel command line, + * but the size is extracted from the EFI tables. + * Both address and size are required for reservation + * to work properly. + */ + + if (!is_vmcore_usable()) + return -EINVAL; + + if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) { + vmcore_unusable(); + return -EINVAL; + } + + *start = (unsigned long)__va(elfcorehdr_addr); + *end = *start + length; + return 0; +} +#endif /* CONFIG_CRASH_DUMP */ + /** * reserve_memory - setup reserved memory areas * @@ -522,32 +547,6 @@ static __init int setup_nomca(char *s) } early_param("nomca", setup_nomca); -#ifdef CONFIG_CRASH_DUMP -int __init reserve_elfcorehdr(u64 *start, u64 *end) -{ - u64 length; - - /* We get the address using the kernel command line, - * but the size is extracted from the EFI tables. - * Both address and size are required for reservation - * to work properly. 
- */ - - if (!is_vmcore_usable()) - return -EINVAL; - - if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) { - vmcore_unusable(); - return -EINVAL; - } - - *start = (unsigned long)__va(elfcorehdr_addr); - *end = *start + length; - return 0; -} - -#endif /* CONFIG_PROC_VMCORE */ - void __init setup_arch (char **cmdline_p) { diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 4b20224b14d9..6fea1844fb95 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -367,3 +367,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 064a967a7b6e..5c6da8d83c1a 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -484,8 +484,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/m68k/include/asm/raw_io.h b/arch/m68k/include/asm/raw_io.h index 911826ea83ce..80eb2396d01e 100644 --- a/arch/m68k/include/asm/raw_io.h +++ b/arch/m68k/include/asm/raw_io.h @@ -17,21 +17,21 @@ * two accesses to memory, which may be undesirable for some devices. */ #define in_8(addr) \ - ({ u8 __v = (*(__force volatile u8 *) (addr)); __v; }) + ({ u8 __v = (*(__force volatile u8 *) (unsigned long)(addr)); __v; }) #define in_be16(addr) \ - ({ u16 __v = (*(__force volatile u16 *) (addr)); __v; }) + ({ u16 __v = (*(__force volatile u16 *) (unsigned long)(addr)); __v; }) #define in_be32(addr) \ - ({ u32 __v = (*(__force volatile u32 *) (addr)); __v; }) + ({ u32 __v = (*(__force volatile u32 *) (unsigned long)(addr)); __v; }) #define in_le16(addr) \ - ({ u16 __v = le16_to_cpu(*(__force volatile __le16 *) (addr)); __v; }) + ({ u16 __v = le16_to_cpu(*(__force volatile __le16 *) (unsigned long)(addr)); __v; }) #define in_le32(addr) \ - ({ u32 __v = le32_to_cpu(*(__force volatile __le32 *) (addr)); __v; }) + ({ u32 __v = le32_to_cpu(*(__force volatile __le32 *) (unsigned long)(addr)); __v; }) -#define out_8(addr,b) (void)((*(__force volatile u8 *) (addr)) = (b)) -#define out_be16(addr,w) (void)((*(__force volatile u16 *) (addr)) = (w)) -#define out_be32(addr,l) (void)((*(__force volatile u32 *) (addr)) = (l)) -#define out_le16(addr,w) (void)((*(__force volatile __le16 *) (addr)) = cpu_to_le16(w)) -#define out_le32(addr,l) (void)((*(__force volatile __le32 *) (addr)) = cpu_to_le32(l)) +#define out_8(addr,b) (void)((*(__force volatile u8 *) (unsigned long)(addr)) = (b)) +#define out_be16(addr,w) (void)((*(__force volatile u16 *) (unsigned long)(addr)) = (w)) +#define out_be32(addr,l) (void)((*(__force volatile u32 *) (unsigned long)(addr)) = (l)) +#define out_le16(addr,w) (void)((*(__force volatile __le16 *) (unsigned long)(addr)) = cpu_to_le16(w)) +#define out_le32(addr,l) (void)((*(__force volatile __le32 *) (unsigned long)(addr)) = cpu_to_le32(l)) #define raw_inb in_8 #define raw_inw in_be16 diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 3ec1291c268d..7976dff8f879 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ 
b/arch/m68k/kernel/syscalls/syscall.tbl @@ -446,3 +446,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c index e1e90c49a496..dfd6202fd403 100644 --- a/arch/m68k/mvme147/config.c +++ b/arch/m68k/mvme147/config.c @@ -171,7 +171,6 @@ static int bcd2int (unsigned char b) int mvme147_hwclk(int op, struct rtc_time *t) { -#warning check me! if (!op) { m147_rtc->ctrl = RTC_READ; t->tm_year = bcd2int (m147_rtc->bcd_year); @@ -183,6 +182,9 @@ int mvme147_hwclk(int op, struct rtc_time *t) m147_rtc->ctrl = 0; if (t->tm_year < 70) t->tm_year += 100; + } else { + /* FIXME Setting the time is not yet supported */ + return -EOPNOTSUPP; } return 0; } diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c index b59593c7cfb9..b4422c2dfbbf 100644 --- a/arch/m68k/mvme16x/config.c +++ b/arch/m68k/mvme16x/config.c @@ -436,7 +436,6 @@ int bcd2int (unsigned char b) int mvme16x_hwclk(int op, struct rtc_time *t) { -#warning check me! if (!op) { rtc->ctrl = RTC_READ; t->tm_year = bcd2int (rtc->bcd_year); @@ -448,6 +447,9 @@ int mvme16x_hwclk(int op, struct rtc_time *t) rtc->ctrl = 0; if (t->tm_year < 70) t->tm_year += 100; + } else { + /* FIXME Setting the time is not yet supported */ + return -EOPNOTSUPP; } return 0; } diff --git a/arch/microblaze/Kbuild b/arch/microblaze/Kbuild index a4e40e534e6a..a1c597889319 100644 --- a/arch/microblaze/Kbuild +++ b/arch/microblaze/Kbuild @@ -1 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-y += kernel/ +obj-y += mm/ +obj-$(CONFIG_PCI) += pci/ +obj-y += boot/dts/ diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 14a67a42fcae..59798e43cdb0 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -44,6 +44,7 @@ config MICROBLAZE select SPARSE_IRQ select SET_FS select ZONE_DMA + select TRACE_IRQFLAGS_SUPPORT # Endianness selection choice diff --git a/arch/microblaze/Kconfig.debug b/arch/microblaze/Kconfig.debug index 865527ac332a..a4e40e534e6a 100644 --- a/arch/microblaze/Kconfig.debug +++ b/arch/microblaze/Kconfig.debug @@ -1,6 +1 @@ # SPDX-License-Identifier: GPL-2.0-only -# For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.rst. - -config TRACE_IRQFLAGS_SUPPORT - def_bool y diff --git a/arch/microblaze/Makefile b/arch/microblaze/Makefile index 6d4af39e3890..9adc6b6434df 100644 --- a/arch/microblaze/Makefile +++ b/arch/microblaze/Makefile @@ -50,17 +50,12 @@ KBUILD_CFLAGS += -ffixed-r31 $(CPUFLAGS-y) $(CPUFLAGS-1) $(CPUFLAGS-2) head-y := arch/microblaze/kernel/head.o libs-y += arch/microblaze/lib/ -core-y += arch/microblaze/kernel/ -core-y += arch/microblaze/mm/ -core-$(CONFIG_PCI) += arch/microblaze/pci/ boot := arch/microblaze/boot # Are we making a simpleImage.<boardname> target? 
If so, crack out the boardname DTB:=$(subst simpleImage.,,$(filter simpleImage.%, $(MAKECMDGOALS))) -core-y += $(boot)/dts/ - export DTB all: linux.bin diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index ce550978f4fc..4b8b2fa78fc5 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -112,8 +112,7 @@ extern int page_is_ram(unsigned long pfn); # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) < (max_mapnr + ARCH_PFN_OFFSET)) - +# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 71cd547655d9..c136a01e467e 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -443,8 +443,6 @@ extern int mem_init_done; asmlinkage void __init mmu_init(void); -void __init *early_get_page(void); - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 9be3ace12938..6b0e11362bd2 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -452,3 +452,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index ab55c70380a5..952f35b335b2 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -265,18 +265,6 @@ asmlinkage void __init mmu_init(void) dma_contiguous_reserve(memory_start + lowmem_size - 1); } -/* This is only called until mem_init is done. 
*/ -void __init *early_get_page(void) -{ - /* - * Mem start + kernel_tlb -> here is limit - * because of mem mapping from head.S - */ - return memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE, - MEMBLOCK_LOW_LIMIT, memory_start + kernel_tlb, - NUMA_NO_NODE); -} - void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) { void *p; diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 38ccb909bc9d..c1833b159d3b 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -33,6 +33,7 @@ #include <linux/init.h> #include <linux/mm_types.h> #include <linux/pgtable.h> +#include <linux/memblock.h> #include <asm/pgalloc.h> #include <linux/io.h> @@ -242,15 +243,13 @@ unsigned long iopa(unsigned long addr) __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - pte_t *pte; - if (mem_init_done) { - pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - } else { - pte = (pte_t *)early_get_page(); - if (pte) - clear_page(pte); - } - return pte; + if (mem_init_done) + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + else + return memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, + MEMBLOCK_LOW_LIMIT, + memory_start + kernel_tlb, + NUMA_NO_NODE); } void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) diff --git a/arch/mips/Kbuild.platforms b/arch/mips/Kbuild.platforms index e4f6e49417a9..584081df89c2 100644 --- a/arch/mips/Kbuild.platforms +++ b/arch/mips/Kbuild.platforms @@ -21,7 +21,6 @@ platform-$(CONFIG_MIPS_MALTA) += mti-malta/ platform-$(CONFIG_MACH_NINTENDO64) += n64/ platform-$(CONFIG_NLM_COMMON) += netlogic/ platform-$(CONFIG_PIC32MZDA) += pic32/ -platform-$(CONFIG_MACH_PISTACHIO) += pistachio/ platform-$(CONFIG_RALINK) += ralink/ platform-$(CONFIG_MIKROTIK_RB532) += rb532/ platform-$(CONFIG_SGI_IP22) += sgi-ip22/ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 24e374266fdc..771ca53af06d 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -98,6 +98,7 @@ config MIPS select PCI_MSI_ARCH_FALLBACKS if PCI_MSI select RTC_LIB select SYSCTL_EXCEPTION_TRACE + select TRACE_IRQFLAGS_SUPPORT select VIRT_TO_BUS select ARCH_HAS_ELFCORE_COMPAT @@ -514,35 +515,6 @@ config MACH_LOONGSON64 and Loongson-2F which will be removed), developed by the Institute of Computing Technology (ICT), Chinese Academy of Sciences (CAS). -config MACH_PISTACHIO - bool "IMG Pistachio SoC based boards" - select BOOT_ELF32 - select BOOT_RAW - select CEVT_R4K - select CLKSRC_MIPS_GIC - select COMMON_CLK - select CSRC_R4K - select DMA_NONCOHERENT - select GPIOLIB - select IRQ_MIPS_CPU - select MFD_SYSCON - select MIPS_CPU_SCACHE - select MIPS_GIC - select PINCTRL - select REGULATOR - select SYS_HAS_CPU_MIPS32_R2 - select SYS_SUPPORTS_32BIT_KERNEL - select SYS_SUPPORTS_LITTLE_ENDIAN - select SYS_SUPPORTS_MIPS_CPS - select SYS_SUPPORTS_MULTITHREADING - select SYS_SUPPORTS_RELOCATABLE - select SYS_SUPPORTS_ZBOOT - select SYS_HAS_EARLY_PRINTK - select USE_GENERIC_EARLY_PRINTK_8250 - select USE_OF - help - This enables support for the IMG Pistachio SoC platform. 
- config MIPS_MALTA bool "MIPS Malta board" select ARCH_MAY_HAVE_PC_FDC @@ -1089,7 +1061,6 @@ source "arch/mips/ingenic/Kconfig" source "arch/mips/jazz/Kconfig" source "arch/mips/lantiq/Kconfig" source "arch/mips/pic32/Kconfig" -source "arch/mips/pistachio/Kconfig" source "arch/mips/ralink/Kconfig" source "arch/mips/sgi-ip27/Kconfig" source "arch/mips/sibyte/Kconfig" diff --git a/arch/mips/Kconfig.debug b/arch/mips/Kconfig.debug index 43dbf5930796..f4ae7900fcd3 100644 --- a/arch/mips/Kconfig.debug +++ b/arch/mips/Kconfig.debug @@ -1,9 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - bool - default y - config EARLY_PRINTK bool "Early printk" if EXPERT depends on SYS_HAS_EARLY_PRINTK diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 653befc1b176..ea3cd080a1c7 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -254,7 +254,7 @@ endif # # Board-dependent options and extra files # -include arch/mips/Kbuild.platforms +include $(srctree)/arch/mips/Kbuild.platforms ifdef CONFIG_PHYSICAL_START load-y = $(CONFIG_PHYSICAL_START) @@ -560,6 +560,9 @@ sead3micro_defconfig-y := micro32r2el_defconfig BOARDS=sead-3 legacy_defconfigs += xilfpga_defconfig xilfpga_defconfig-y := 32r2el_defconfig BOARDS=xilfpga +legacy_defconfigs += pistachio_defconfig +pistachio_defconfig-y := 32r2el_defconfig BOARDS=marduk + .PHONY: $(legacy_defconfigs) $(legacy_defconfigs): $(Q)$(MAKE) -f $(srctree)/Makefile $($@-y) diff --git a/arch/mips/alchemy/devboards/db1200.c b/arch/mips/alchemy/devboards/db1200.c index 421d651433b6..1864eb935ca5 100644 --- a/arch/mips/alchemy/devboards/db1200.c +++ b/arch/mips/alchemy/devboards/db1200.c @@ -835,7 +835,7 @@ int __init db1200_dev_setup(void) if (!IS_ERR(c)) { pfc = clk_round_rate(c, 50000000); if ((pfc < 1) || (abs(50000000 - pfc) > 2500000)) - pr_warn("DB1200: cant get I2C close to 50MHz\n"); + pr_warn("DB1200: can't get I2C close to 50MHz\n"); else clk_set_rate(c, pfc); clk_prepare_enable(c); diff --git a/arch/mips/boot/dts/Makefile b/arch/mips/boot/dts/Makefile index 60bd7d2a9ad8..be96d35eb582 100644 --- a/arch/mips/boot/dts/Makefile +++ b/arch/mips/boot/dts/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 subdir-$(CONFIG_BMIPS_GENERIC) += brcm subdir-$(CONFIG_CAVIUM_OCTEON_SOC) += cavium-octeon -subdir-$(CONFIG_MACH_PISTACHIO) += img +subdir-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += img subdir-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += img subdir-$(CONFIG_MACH_INGENIC) += ingenic subdir-$(CONFIG_LANTIQ) += lantiq diff --git a/arch/mips/boot/dts/img/Makefile b/arch/mips/boot/dts/img/Makefile index 441a3c16efb0..ebb47490b04b 100644 --- a/arch/mips/boot/dts/img/Makefile +++ b/arch/mips/boot/dts/img/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 dtb-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += boston.dtb -dtb-$(CONFIG_MACH_PISTACHIO) += pistachio_marduk.dtb -obj-$(CONFIG_MACH_PISTACHIO) += pistachio_marduk.dtb.o +dtb-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += pistachio_marduk.dtb diff --git a/arch/mips/boot/dts/img/pistachio.dtsi b/arch/mips/boot/dts/img/pistachio.dtsi index dc3b7909de73..b1db8b8f446f 100644 --- a/arch/mips/boot/dts/img/pistachio.dtsi +++ b/arch/mips/boot/dts/img/pistachio.dtsi @@ -900,6 +900,16 @@ }; }; + cpc: cpc@1bde0000 { + compatible = "mti,mips-cpc"; + reg = <0x1bde0000 0x10000>; + }; + + cdmm: cdmm@1bdf0000 { + compatible = "mti,mips-cdmm"; + reg = <0x1bdf0000 0x10000>; + }; + usb_phy: usb-phy { compatible = "img,pistachio-usb-phy"; clocks = <&clk_core CLK_USB_PHY>; diff --git a/arch/mips/boot/dts/mscc/ocelot.dtsi 
b/arch/mips/boot/dts/mscc/ocelot.dtsi index 535a98284dcb..e51db651af13 100644 --- a/arch/mips/boot/dts/mscc/ocelot.dtsi +++ b/arch/mips/boot/dts/mscc/ocelot.dtsi @@ -150,36 +150,47 @@ port0: port@0 { reg = <0>; + status = "disabled"; }; port1: port@1 { reg = <1>; + status = "disabled"; }; port2: port@2 { reg = <2>; + status = "disabled"; }; port3: port@3 { reg = <3>; + status = "disabled"; }; port4: port@4 { reg = <4>; + status = "disabled"; }; port5: port@5 { reg = <5>; + status = "disabled"; }; port6: port@6 { reg = <6>; + status = "disabled"; }; port7: port@7 { reg = <7>; + status = "disabled"; }; port8: port@8 { reg = <8>; + status = "disabled"; }; port9: port@9 { reg = <9>; + status = "disabled"; }; port10: port@10 { reg = <10>; + status = "disabled"; }; }; }; diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts index 897de5025d7f..bd240690cb37 100644 --- a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts +++ b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts @@ -69,40 +69,52 @@ }; &port0 { + status = "okay"; phy-handle = <&phy0>; + phy-mode = "internal"; }; &port1 { + status = "okay"; phy-handle = <&phy1>; + phy-mode = "internal"; }; &port2 { + status = "okay"; phy-handle = <&phy2>; + phy-mode = "internal"; }; &port3 { + status = "okay"; phy-handle = <&phy3>; + phy-mode = "internal"; }; &port4 { + status = "okay"; phy-handle = <&phy7>; phy-mode = "sgmii"; phys = <&serdes 4 SERDES1G(2)>; }; &port5 { + status = "okay"; phy-handle = <&phy4>; phy-mode = "sgmii"; phys = <&serdes 5 SERDES1G(5)>; }; &port6 { + status = "okay"; phy-handle = <&phy6>; phy-mode = "sgmii"; phys = <&serdes 6 SERDES1G(3)>; }; &port9 { + status = "okay"; phy-handle = <&phy5>; phy-mode = "sgmii"; phys = <&serdes 9 SERDES1G(4)>; diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb123.dts b/arch/mips/boot/dts/mscc/ocelot_pcb123.dts index ef852f382da8..0185045c7630 100644 --- a/arch/mips/boot/dts/mscc/ocelot_pcb123.dts +++ b/arch/mips/boot/dts/mscc/ocelot_pcb123.dts @@ -47,17 +47,25 @@ }; &port0 { + status = "okay"; phy-handle = <&phy0>; + phy-mode = "internal"; }; &port1 { + status = "okay"; phy-handle = <&phy1>; + phy-mode = "internal"; }; &port2 { + status = "okay"; phy-handle = <&phy2>; + phy-mode = "internal"; }; &port3 { + status = "okay"; phy-handle = <&phy3>; + phy-mode = "internal"; }; diff --git a/arch/mips/cavium-octeon/executive/cvmx-bootmem.c b/arch/mips/cavium-octeon/executive/cvmx-bootmem.c index e794b2d53adf..b63ad5d42cc7 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-bootmem.c +++ b/arch/mips/cavium-octeon/executive/cvmx-bootmem.c @@ -44,7 +44,7 @@ static struct cvmx_bootmem_desc *cvmx_bootmem_desc; /* See header file for descriptions of functions */ -/** +/* * This macro returns a member of the * cvmx_bootmem_named_block_desc_t structure. These members can't * be directly addressed as they might be in memory not directly @@ -60,7 +60,7 @@ static struct cvmx_bootmem_desc *cvmx_bootmem_desc; offsetof(struct cvmx_bootmem_named_block_desc, field), \ sizeof_field(struct cvmx_bootmem_named_block_desc, field)) -/** +/* * This function is the implementation of the get macros defined * for individual structure members. The argument are generated * by the macros inorder to read only the needed memory. 
@@ -115,7 +115,7 @@ static uint64_t cvmx_bootmem_phy_get_next(uint64_t addr) return cvmx_read64_uint64((addr + NEXT_OFFSET) | (1ull << 63)); } -/** +/* * Allocate a block of memory from the free list that was * passed to the application by the bootloader within a specified * address range. This is an allocate-only algorithm, so @@ -550,7 +550,7 @@ bootmem_free_done: } -/** +/* * Finds a named memory block by name. * Also used for finding an unused entry in the named block table. * @@ -657,7 +657,7 @@ struct cvmx_bootmem_named_block_desc *cvmx_bootmem_find_named_block(char *name) } EXPORT_SYMBOL(cvmx_bootmem_find_named_block); -/** +/* * Frees a named block. * * @name: name of block to free diff --git a/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c b/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c index 3839feba68f2..20189e9ad94d 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c +++ b/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c @@ -42,14 +42,14 @@ #include <asm/octeon/cvmx-pexp-defs.h> #include <asm/octeon/cvmx-pko-defs.h> -/** +/* * This application uses this pointer to access the global queue * state. It points to a bootmem named block. */ __cvmx_cmd_queue_all_state_t *__cvmx_cmd_queue_state_ptr; EXPORT_SYMBOL_GPL(__cvmx_cmd_queue_state_ptr); -/** +/* * Initialize the Global queue state pointer. * * Returns CVMX_CMD_QUEUE_SUCCESS or a failure code @@ -57,27 +57,14 @@ EXPORT_SYMBOL_GPL(__cvmx_cmd_queue_state_ptr); static cvmx_cmd_queue_result_t __cvmx_cmd_queue_init_state_ptr(void) { char *alloc_name = "cvmx_cmd_queues"; -#if defined(CONFIG_CAVIUM_RESERVE32) && CONFIG_CAVIUM_RESERVE32 - extern uint64_t octeon_reserve32_memory; -#endif if (likely(__cvmx_cmd_queue_state_ptr)) return CVMX_CMD_QUEUE_SUCCESS; -#if defined(CONFIG_CAVIUM_RESERVE32) && CONFIG_CAVIUM_RESERVE32 - if (octeon_reserve32_memory) - __cvmx_cmd_queue_state_ptr = - cvmx_bootmem_alloc_named_range(sizeof(*__cvmx_cmd_queue_state_ptr), - octeon_reserve32_memory, - octeon_reserve32_memory + - (CONFIG_CAVIUM_RESERVE32 << - 20) - 1, 128, alloc_name); - else -#endif - __cvmx_cmd_queue_state_ptr = - cvmx_bootmem_alloc_named(sizeof(*__cvmx_cmd_queue_state_ptr), - 128, - alloc_name); + __cvmx_cmd_queue_state_ptr = + cvmx_bootmem_alloc_named(sizeof(*__cvmx_cmd_queue_state_ptr), + 128, + alloc_name); if (__cvmx_cmd_queue_state_ptr) memset(__cvmx_cmd_queue_state_ptr, 0, sizeof(*__cvmx_cmd_queue_state_ptr)); @@ -97,7 +84,7 @@ static cvmx_cmd_queue_result_t __cvmx_cmd_queue_init_state_ptr(void) return CVMX_CMD_QUEUE_SUCCESS; } -/** +/* * Initialize a command queue for use. The initial FPA buffer is * allocated and the hardware unit is configured to point to the * new command queue. @@ -195,7 +182,7 @@ cvmx_cmd_queue_result_t cvmx_cmd_queue_initialize(cvmx_cmd_queue_id_t queue_id, } } -/** +/* * Shutdown a queue a free it's command buffers to the FPA. The * hardware connected to the queue must be stopped before this * function is called. @@ -231,7 +218,7 @@ cvmx_cmd_queue_result_t cvmx_cmd_queue_shutdown(cvmx_cmd_queue_id_t queue_id) return CVMX_CMD_QUEUE_SUCCESS; } -/** +/* * Return the number of command words pending in the queue. This * function may be relatively slow for some hardware units. * @@ -287,7 +274,7 @@ int cvmx_cmd_queue_length(cvmx_cmd_queue_id_t queue_id) return CVMX_CMD_QUEUE_INVALID_PARAM; } -/** +/* * Return the command buffer to be written to. The purpose of this * function is to allow CVMX routine access t othe low level buffer * for initial hardware setup. 
User applications should not call this diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-board.c b/arch/mips/cavium-octeon/executive/cvmx-helper-board.c index abd11b7af22f..1daa0c6b6f4e 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-helper-board.c +++ b/arch/mips/cavium-octeon/executive/cvmx-helper-board.c @@ -44,7 +44,7 @@ #include <asm/octeon/cvmx-gmxx-defs.h> #include <asm/octeon/cvmx-asxx-defs.h> -/** +/* * Return the MII PHY address associated with the given IPD * port. A result of -1 means there isn't a MII capable PHY * connected to this port. On chips supporting multiple MII @@ -189,7 +189,7 @@ int cvmx_helper_board_get_mii_address(int ipd_port) return -1; } -/** +/* * This function is the board specific method of determining an * ethernet ports link speed. Most Octeon boards have Marvell PHYs * and are handled by the fall through case. This function must be @@ -274,7 +274,7 @@ union cvmx_helper_link_info __cvmx_helper_board_link_get(int ipd_port) return result; } -/** +/* * This function is called by cvmx_helper_interface_probe() after it * determines the number of ports Octeon can support on a specific * interface. This function is the per board location to override @@ -320,7 +320,7 @@ int __cvmx_helper_board_interface_probe(int interface, int supported_ports) return supported_ports; } -/** +/* * Get the clock type used for the USB block based on board type. * Used by the USB code for auto configuration of clock type. * diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c b/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c index c4b58598aa9d..a8c3be4eb6f0 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c +++ b/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c @@ -42,7 +42,7 @@ #include <asm/octeon/cvmx-asxx-defs.h> #include <asm/octeon/cvmx-dbg-defs.h> -/** +/* * Probe RGMII ports and determine the number present * * @interface: Interface to probe @@ -88,7 +88,7 @@ int __cvmx_helper_rgmii_probe(int interface) return num_ports; } -/** +/* * Put an RGMII interface in loopback mode. Internal packets sent * out will be received back again on the same port. Externally * received packets will echo back out. @@ -120,7 +120,7 @@ void cvmx_helper_rgmii_internal_loopback(int port) cvmx_write_csr(CVMX_GMXX_PRTX_CFG(index, interface), gmx_cfg.u64); } -/** +/* * Workaround ASX setup errata with CN38XX pass1 * * @interface: Interface to setup @@ -148,7 +148,7 @@ static int __cvmx_helper_errata_asx_pass1(int interface, int port, return 0; } -/** +/* * Configure all of the ASX, GMX, and PKO registers required * to get RGMII to function on the supplied interface. * @@ -251,7 +251,7 @@ int __cvmx_helper_rgmii_enable(int interface) return 0; } -/** +/* * Return the link state of an IPD/PKO port as returned by * auto negotiation. The result of this function may not match * Octeon's link config if auto negotiation has changed since @@ -280,7 +280,7 @@ union cvmx_helper_link_info __cvmx_helper_rgmii_link_get(int ipd_port) return __cvmx_helper_board_link_get(ipd_port); } -/** +/* * Configure an IPD/PKO port for the specified link state. This * function does not influence auto negotiation at the PHY level. 
* The passed link state must always match the link state returned diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c b/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c index 842990e8404f..fea71a85bb29 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c +++ b/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c @@ -54,7 +54,7 @@ int __cvmx_helper_xaui_enumerate(int interface) return 1; } -/** +/* * Probe a XAUI interface and determine the number of ports * connected to it. The XAUI interface should still be down * after this call. @@ -102,7 +102,7 @@ int __cvmx_helper_xaui_probe(int interface) return __cvmx_helper_xaui_enumerate(interface); } -/** +/* * Bringup and enable a XAUI interface. After this call packet * I/O should be fully functional. This is called with IPD * enabled but PKO disabled. @@ -249,7 +249,7 @@ int __cvmx_helper_xaui_enable(int interface) return 0; } -/** +/* * Return the link state of an IPD/PKO port as returned by * auto negotiation. The result of this function may not match * Octeon's link config if auto negotiation has changed since @@ -288,7 +288,7 @@ union cvmx_helper_link_info __cvmx_helper_xaui_link_get(int ipd_port) return result; } -/** +/* * Configure an IPD/PKO port for the specified link state. This * function does not influence auto negotiation at the PHY level. * The passed link state must always match the link state returned diff --git a/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c b/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c index 2f415d9d0f3c..67d6da21d49f 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c +++ b/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c @@ -46,7 +46,9 @@ /** - * __cvmx_interrupt_gmxx_rxx_int_en_enable enables all interrupt bits in cvmx_gmxx_rxx_int_en_t + * __cvmx_interrupt_gmxx_rxx_int_en_enable - enable all interrupt bits in cvmx_gmxx_rxx_int_en_t + * @index: interrupt register offset + * @block: interrupt register block_id */ void __cvmx_interrupt_gmxx_rxx_int_en_enable(int index, int block) { @@ -227,7 +229,9 @@ void __cvmx_interrupt_gmxx_rxx_int_en_enable(int index, int block) cvmx_write_csr(CVMX_GMXX_RXX_INT_EN(index, block), gmx_rx_int_en.u64); } /** - * __cvmx_interrupt_pcsx_intx_en_reg_enable enables all interrupt bits in cvmx_pcsx_intx_en_reg_t + * __cvmx_interrupt_pcsx_intx_en_reg_enable - enable all interrupt bits in cvmx_pcsx_intx_en_reg_t + * @index: interrupt register offset + * @block: interrupt register block_id */ void __cvmx_interrupt_pcsx_intx_en_reg_enable(int index, int block) { @@ -268,7 +272,8 @@ void __cvmx_interrupt_pcsx_intx_en_reg_enable(int index, int block) cvmx_write_csr(CVMX_PCSX_INTX_EN_REG(index, block), pcs_int_en_reg.u64); } /** - * __cvmx_interrupt_pcsxx_int_en_reg_enable enables all interrupt bits in cvmx_pcsxx_int_en_reg_t + * __cvmx_interrupt_pcsxx_int_en_reg_enable - enable all interrupt bits in cvmx_pcsxx_int_en_reg_t + * @index: interrupt register block_id */ void __cvmx_interrupt_pcsxx_int_en_reg_enable(int index) { @@ -298,7 +303,8 @@ void __cvmx_interrupt_pcsxx_int_en_reg_enable(int index) } /** - * __cvmx_interrupt_spxx_int_msk_enable enables all interrupt bits in cvmx_spxx_int_msk_t + * __cvmx_interrupt_spxx_int_msk_enable - enable all interrupt bits in cvmx_spxx_int_msk_t + * @index: interrupt register block_id */ void __cvmx_interrupt_spxx_int_msk_enable(int index) { @@ -337,7 +343,8 @@ void __cvmx_interrupt_spxx_int_msk_enable(int index) cvmx_write_csr(CVMX_SPXX_INT_MSK(index), 
spx_int_msk.u64); } /** - * __cvmx_interrupt_stxx_int_msk_enable enables all interrupt bits in cvmx_stxx_int_msk_t + * __cvmx_interrupt_stxx_int_msk_enable - enable all interrupt bits in cvmx_stxx_int_msk_t + * @index: interrupt register block_id */ void __cvmx_interrupt_stxx_int_msk_enable(int index) { diff --git a/arch/mips/cavium-octeon/executive/cvmx-l2c.c b/arch/mips/cavium-octeon/executive/cvmx-l2c.c index 83df0a963a8b..33b303691bc2 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-l2c.c +++ b/arch/mips/cavium-octeon/executive/cvmx-l2c.c @@ -281,7 +281,7 @@ uint64_t cvmx_l2c_read_perf(uint32_t counter) } } -/** +/* * @INTERNAL * Helper function use to fault in cache lines for L2 cache locking * @@ -575,7 +575,7 @@ union __cvmx_l2c_tag { }; -/** +/* * @INTERNAL * Function to read a L2C tag. This code make the current core * the 'debug core' for the L2. This code must only be executed by @@ -764,9 +764,8 @@ int cvmx_l2c_get_cache_size_bytes(void) CVMX_CACHE_LINE_SIZE; } -/** +/* * Return log base 2 of the number of sets in the L2 cache - * Returns */ int cvmx_l2c_get_set_bits(void) { @@ -857,7 +856,7 @@ int cvmx_l2c_get_num_assoc(void) return l2_assoc; } -/** +/* * Flush a line from the L2 cache * This should only be called from one core at a time, as this routine * sets the core to the 'debug' core in order to flush the line. diff --git a/arch/mips/cavium-octeon/executive/cvmx-pko.c b/arch/mips/cavium-octeon/executive/cvmx-pko.c index b0efc35e95c4..7c4879e74318 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-pko.c +++ b/arch/mips/cavium-octeon/executive/cvmx-pko.c @@ -35,7 +35,7 @@ #include <asm/octeon/cvmx-pko.h> #include <asm/octeon/cvmx-helper.h> -/** +/* * Internal state of packet output */ @@ -176,7 +176,7 @@ static void __cvmx_pko_chip_init(void) } } -/** +/* * Call before any other calls to initialize the packet * output system. This does chip global config, and should only be * done by one core. @@ -229,7 +229,7 @@ void cvmx_pko_initialize_global(void) } } -/** +/* * This function does per-core initialization required by the PKO routines. * This must be called on all cores that will do packet output, and must * be called after the FPA has been initialized and filled with pages. @@ -243,7 +243,7 @@ int cvmx_pko_initialize_local(void) return 0; } -/** +/* * Enables the packet output hardware. It must already be * configured. */ @@ -266,7 +266,7 @@ void cvmx_pko_enable(void) cvmx_write_csr(CVMX_PKO_REG_FLAGS, flags.u64); } -/** +/* * Disables the packet output. Does not affect any configuration. */ void cvmx_pko_disable(void) @@ -278,7 +278,7 @@ void cvmx_pko_disable(void) } EXPORT_SYMBOL_GPL(cvmx_pko_disable); -/** +/* * Reset the packet output. */ static void __cvmx_pko_reset(void) @@ -289,7 +289,7 @@ static void __cvmx_pko_reset(void) cvmx_write_csr(CVMX_PKO_REG_FLAGS, pko_reg_flags.u64); } -/** +/* * Shutdown and free resources required by packet output. */ void cvmx_pko_shutdown(void) @@ -320,7 +320,7 @@ void cvmx_pko_shutdown(void) } EXPORT_SYMBOL_GPL(cvmx_pko_shutdown); -/** +/* * Configure a output port and the associated queues for use. * * @port: Port to configure. @@ -548,7 +548,7 @@ cvmx_pko_status_t cvmx_pko_config_port(uint64_t port, uint64_t base_queue, } #ifdef PKO_DEBUG -/** +/* * Show map of ports -> queues for different cores. */ void cvmx_pko_show_queue_map() @@ -573,7 +573,7 @@ void cvmx_pko_show_queue_map() } #endif -/** +/* * Rate limit a PKO port to a max packets/sec. This function is only * supported on CN51XX and higher, excluding CN58XX. 
* @@ -606,7 +606,7 @@ int cvmx_pko_rate_limit_packets(int port, int packets_s, int burst) return 0; } -/** +/* * Rate limit a PKO port to a max bits/sec. This function is only * supported on CN51XX and higher, excluding CN58XX. * diff --git a/arch/mips/cavium-octeon/executive/cvmx-spi.c b/arch/mips/cavium-octeon/executive/cvmx-spi.c index f51957a3e915..eb9333e84a6b 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-spi.c +++ b/arch/mips/cavium-octeon/executive/cvmx-spi.c @@ -66,7 +66,7 @@ static cvmx_spi_callbacks_t cvmx_spi_callbacks = { .interface_up_cb = cvmx_spi_interface_up_cb }; -/** +/* * Get current SPI4 initialization callbacks * * @callbacks: Pointer to the callbacks structure.to fill @@ -78,7 +78,7 @@ void cvmx_spi_get_callbacks(cvmx_spi_callbacks_t *callbacks) memcpy(callbacks, &cvmx_spi_callbacks, sizeof(cvmx_spi_callbacks)); } -/** +/* * Set new SPI4 initialization callbacks * * @new_callbacks: Pointer to an updated callbacks structure. @@ -88,7 +88,7 @@ void cvmx_spi_set_callbacks(cvmx_spi_callbacks_t *new_callbacks) memcpy(&cvmx_spi_callbacks, new_callbacks, sizeof(cvmx_spi_callbacks)); } -/** +/* * Initialize and start the SPI interface. * * @interface: The identifier of the packet interface to configure and @@ -133,7 +133,7 @@ int cvmx_spi_start_interface(int interface, cvmx_spi_mode_t mode, int timeout, return res; } -/** +/* * This routine restarts the SPI interface after it has lost synchronization * with its correspondent system. * @@ -179,7 +179,7 @@ int cvmx_spi_restart_interface(int interface, cvmx_spi_mode_t mode, int timeout) } EXPORT_SYMBOL_GPL(cvmx_spi_restart_interface); -/** +/* * Callback to perform SPI4 reset * * @interface: The identifier of the packet interface to configure and @@ -294,7 +294,7 @@ int cvmx_spi_reset_cb(int interface, cvmx_spi_mode_t mode) return 0; } -/** +/* * Callback to setup calendar and miscellaneous settings before clock detection * * @interface: The identifier of the packet interface to configure and @@ -413,7 +413,7 @@ int cvmx_spi_calendar_setup_cb(int interface, cvmx_spi_mode_t mode, return 0; } -/** +/* * Callback to perform clock detection * * @interface: The identifier of the packet interface to configure and @@ -491,7 +491,7 @@ int cvmx_spi_clock_detect_cb(int interface, cvmx_spi_mode_t mode, int timeout) return 0; } -/** +/* * Callback to perform link training * * @interface: The identifier of the packet interface to configure and @@ -560,7 +560,7 @@ int cvmx_spi_training_cb(int interface, cvmx_spi_mode_t mode, int timeout) return 0; } -/** +/* * Callback to perform calendar data synchronization * * @interface: The identifier of the packet interface to configure and @@ -617,7 +617,7 @@ int cvmx_spi_calendar_sync_cb(int interface, cvmx_spi_mode_t mode, int timeout) return 0; } -/** +/* * Callback to handle interface up * * @interface: The identifier of the packet interface to configure and diff --git a/arch/mips/cavium-octeon/flash_setup.c b/arch/mips/cavium-octeon/flash_setup.c index a5e8f4a784af..c8a8c6d359b9 100644 --- a/arch/mips/cavium-octeon/flash_setup.c +++ b/arch/mips/cavium-octeon/flash_setup.c @@ -62,7 +62,7 @@ static void octeon_flash_map_copy_to(struct map_info *map, unsigned long to, up(&octeon_bootbus_sem); } -/** +/* * Module/ driver initialization. 
* * Returns Zero on success diff --git a/arch/mips/cavium-octeon/octeon-memcpy.S b/arch/mips/cavium-octeon/octeon-memcpy.S index 600d018cf354..0a515cde1c18 100644 --- a/arch/mips/cavium-octeon/octeon-memcpy.S +++ b/arch/mips/cavium-octeon/octeon-memcpy.S @@ -154,8 +154,6 @@ FEXPORT(__raw_copy_from_user) EXPORT_SYMBOL(__raw_copy_from_user) FEXPORT(__raw_copy_to_user) EXPORT_SYMBOL(__raw_copy_to_user) -FEXPORT(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) /* * Note: dst & src may be unaligned, len may be 0 * Temps diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c index ce4e2806159b..00bf269763cf 100644 --- a/arch/mips/cavium-octeon/setup.c +++ b/arch/mips/cavium-octeon/setup.c @@ -284,11 +284,6 @@ void octeon_crash_smp_send_stop(void) #endif /* CONFIG_KEXEC */ -#ifdef CONFIG_CAVIUM_RESERVE32 -uint64_t octeon_reserve32_memory; -EXPORT_SYMBOL(octeon_reserve32_memory); -#endif - #ifdef CONFIG_KEXEC /* crashkernel cmdline parameter is parsed _after_ memory setup * we also parse it here (workaround for EHB5200) */ @@ -300,9 +295,10 @@ static int octeon_uart; extern asmlinkage void handle_int(void); /** - * Return non zero if we are currently running in the Octeon simulator + * octeon_is_simulation - Return non-zero if we are currently running + * in the Octeon simulator * - * Returns + * Return: non-0 if running in the Octeon simulator, 0 otherwise */ int octeon_is_simulation(void) { @@ -311,10 +307,10 @@ int octeon_is_simulation(void) EXPORT_SYMBOL(octeon_is_simulation); /** - * Return true if Octeon is in PCI Host mode. This means + * octeon_is_pci_host - Return true if Octeon is in PCI Host mode. This means * Linux can control the PCI bus. * - * Returns Non zero if Octeon in host mode. + * Return: Non-zero if Octeon is in host mode. */ int octeon_is_pci_host(void) { @@ -326,9 +322,9 @@ int octeon_is_pci_host(void) } /** - * Get the clock rate of Octeon + * octeon_get_clock_rate - Get the clock rate of Octeon * - * Returns Clock rate in HZ + * Return: Clock rate in HZ */ uint64_t octeon_get_clock_rate(void) { @@ -348,11 +344,11 @@ EXPORT_SYMBOL(octeon_get_io_clock_rate); /** - * Write to the LCD display connected to the bootbus. This display - * exists on most Cavium evaluation boards. If it doesn't exist, then - * this function doesn't do anything. - * + * octeon_write_lcd - Write to the LCD display connected to the bootbus. * @s: String to write + * + * This display exists on most Cavium evaluation boards. If it doesn't exist, + * then this function doesn't do anything. */ static void octeon_write_lcd(const char *s) { @@ -372,9 +368,9 @@ static void octeon_write_lcd(const char *s) } /** - * Return the console uart passed by the bootloader + * octeon_get_boot_uart - Return the console uart passed by the bootloader * - * Returns uart (0 or 1) + * Return: uart number (0 or 1) */ static int octeon_get_boot_uart(void) { @@ -383,9 +379,9 @@ static int octeon_get_boot_uart(void) } /** - * Get the coremask Linux was booted on. + * octeon_get_boot_coremask - Get the coremask Linux was booted on. * - * Returns Core mask + * Return: Core mask */ int octeon_get_boot_coremask(void) { @@ -393,7 +389,7 @@ int octeon_get_boot_coremask(void) } /** - * Check the hardware BIST results for a CPU + * octeon_check_cpu_bist - Check the hardware BIST results for a CPU */ void octeon_check_cpu_bist(void) { @@ -424,7 +420,7 @@ void octeon_check_cpu_bist(void) } /** - * Reboot Octeon + * octeon_restart - Reboot Octeon * * @command: Command to pass to the bootloader. 
Currently ignored. */ @@ -449,7 +445,7 @@ static void octeon_restart(char *command) /** - * Permanently stop a core. + * octeon_kill_core - Permanently stop a core. * * @arg: Ignored. */ @@ -469,7 +465,7 @@ static void octeon_kill_core(void *arg) /** - * Halt the system + * octeon_halt - Halt the system */ static void octeon_halt(void) { @@ -512,9 +508,9 @@ static void __init init_octeon_system_type(void) } /** - * Return a string representing the system type + * octeon_board_type_string - Return a string representing the system type * - * Returns + * Return: system type string */ const char *octeon_board_type_string(void) { @@ -655,7 +651,7 @@ void octeon_user_io_init(void) } /** - * Early entry point for arch setup + * prom_init - Early entry point for arch setup */ void __init prom_init(void) { @@ -665,9 +661,7 @@ void __init prom_init(void) int i; u64 t; int argc; -#ifdef CONFIG_CAVIUM_RESERVE32 - int64_t addr = -1; -#endif + /* * The bootloader passes a pointer to the boot descriptor in * $a3, this is available as fw_arg3. @@ -782,25 +776,6 @@ void __init prom_init(void) cvmx_write_csr(CVMX_LED_UDD_DATX(1), 0); cvmx_write_csr(CVMX_LED_EN, 1); } -#ifdef CONFIG_CAVIUM_RESERVE32 - /* - * We need to temporarily allocate all memory in the reserve32 - * region. This makes sure the kernel doesn't allocate this - * memory when it is getting memory from the - * bootloader. Later, after the memory allocations are - * complete, the reserve32 will be freed. - * - * Allocate memory for RESERVED32 aligned on 2MB boundary. This - * is in case we later use hugetlb entries with it. - */ - addr = cvmx_bootmem_phy_named_block_alloc(CONFIG_CAVIUM_RESERVE32 << 20, - 0, 0, 2 << 20, - "CAVIUM_RESERVE32", 0); - if (addr < 0) - pr_err("Failed to allocate CAVIUM_RESERVE32 memory area\n"); - else - octeon_reserve32_memory = addr; -#endif #ifdef CONFIG_CAVIUM_OCTEON_LOCK_L2 if (cvmx_read_csr(CVMX_L2D_FUS3) & (3ull << 34)) { @@ -1078,16 +1053,6 @@ void __init plat_mem_setup(void) cvmx_bootmem_unlock(); #endif /* CONFIG_CRASH_DUMP */ -#ifdef CONFIG_CAVIUM_RESERVE32 - /* - * Now that we've allocated the kernel memory it is safe to - * free the reserved region. We free it here so that builtin - * drivers can use the memory. - */ - if (octeon_reserve32_memory) - cvmx_bootmem_free_named("CAVIUM_RESERVE32"); -#endif /* CONFIG_CAVIUM_RESERVE32 */ - if (total == 0) panic("Unable to allocate memory from " "cvmx_bootmem_phy_alloc"); diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index 66ce5527da54..89954f5f87fb 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -91,7 +91,7 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/** +/* * Cause the function described by call_data to be executed on the passed * cpu. When the function has finished, increment the finished field of * call_data. 
@@ -115,7 +115,7 @@ static inline void octeon_send_ipi_mask(const struct cpumask *mask, octeon_send_ipi_single(i, action); } -/** +/* * Detect available CPUs, populate cpu_possible_mask */ static void octeon_smp_hotplug_setup(void) @@ -202,9 +202,8 @@ int plat_post_relocation(long offset) } #endif /* CONFIG_RELOCATABLE */ -/** +/* * Firmware CPU startup hook - * */ static int octeon_boot_secondary(int cpu, struct task_struct *idle) { @@ -232,7 +231,7 @@ static int octeon_boot_secondary(int cpu, struct task_struct *idle) return 0; } -/** +/* * After we've done initial boot, this function is called to allow the * board code to clean up state, if needed */ @@ -250,9 +249,8 @@ static void octeon_init_secondary(void) octeon_irq_setup_secondary(); } -/** +/* * Callout to firmware before smp_init - * */ static void __init octeon_prepare_cpus(unsigned int max_cpus) { @@ -268,7 +266,7 @@ static void __init octeon_prepare_cpus(unsigned int max_cpus) } } -/** +/* * Last chance for the board code to finish SMP initialization before * the CPU is "online". */ diff --git a/arch/mips/configs/generic/board-marduk.config b/arch/mips/configs/generic/board-marduk.config new file mode 100644 index 000000000000..05ca34cd5a73 --- /dev/null +++ b/arch/mips/configs/generic/board-marduk.config @@ -0,0 +1,53 @@ +CONFIG_FIT_IMAGE_FDT_MARDUK=y + +CONFIG_SCSI=y +CONFIG_BLK_DEV_SD=y + +CONFIG_CLKSRC_PISTACHIO=y + +CONFIG_COMMON_CLK_PISTACHIO=y + +CONFIG_DMADEVICES=y +CONFIG_IMG_MDC_DMA=y + +CONFIG_GPIOLIB=y +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_PCH=y + +CONFIG_I2C=y +CONFIG_I2C_IMG=y + +CONFIG_MMC=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_DW=y +CONFIG_MMC_DW_PLTFM=y + +CONFIG_NETDEVICES=y +CONFIG_STMMAC_ETH=y +CONFIG_STMMAC_PLATFORM=y + +CONFIG_PHY_PISTACHIO_USB=y + +CONFIG_PINCTRL=y +CONFIG_PINCTRL_PISTACHIO=y + +CONFIG_RESET_PISTACHIO=y + +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SERIAL_8250_DW=y + +CONFIG_SPI=y +CONFIG_SRAM=y + +CONFIG_USB=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_DWC2=y + +CONFIG_CRYPTO_DEV_IMGTEC_HASH=y +CONFIG_IMGPDC_WDT=y +CONFIG_IR_IMG=y +CONFIG_CC10001_ADC=y +CONFIG_SND_SOC_IMG=y diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig index aaf9d5e0aa2c..791894c4d8fb 100644 --- a/arch/mips/configs/lemote2f_defconfig +++ b/arch/mips/configs/lemote2f_defconfig @@ -116,7 +116,6 @@ CONFIG_8139TOO=y CONFIG_R8169=y CONFIG_USB_USBNET=m CONFIG_USB_NET_CDC_EEM=m -CONFIG_INPUT_POLLDEV=m CONFIG_INPUT_EVDEV=y # CONFIG_MOUSE_PS2_ALPS is not set # CONFIG_MOUSE_PS2_LOGIPS2PP is not set diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig index 63fe2da1b37f..fd567247adc7 100644 --- a/arch/mips/configs/pic32mzda_defconfig +++ b/arch/mips/configs/pic32mzda_defconfig @@ -34,7 +34,6 @@ CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_SCAN_ASYNC=y # CONFIG_SCSI_LOWLEVEL is not set CONFIG_INPUT_LEDS=m -CONFIG_INPUT_POLLDEV=y CONFIG_INPUT_MOUSEDEV=m CONFIG_INPUT_EVDEV=y CONFIG_INPUT_EVBUG=m diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig deleted file mode 100644 index b9adf15ebbec..000000000000 --- a/arch/mips/configs/pistachio_defconfig +++ /dev/null @@ -1,316 +0,0 @@ -# CONFIG_LOCALVERSION_AUTO is not set -CONFIG_DEFAULT_HOSTNAME="localhost" -CONFIG_SYSVIPC=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_PREEMPT_VOLUNTARY=y -CONFIG_IKCONFIG=m -CONFIG_IKCONFIG_PROC=y -CONFIG_LOG_BUF_SHIFT=18 -CONFIG_CGROUPS=y -CONFIG_CGROUP_SCHED=y -CONFIG_CFS_BANDWIDTH=y 
-CONFIG_CGROUP_FREEZER=y -CONFIG_NAMESPACES=y -CONFIG_USER_NS=y -CONFIG_BLK_DEV_INITRD=y -# CONFIG_RD_BZIP2 is not set -# CONFIG_RD_LZMA is not set -# CONFIG_RD_LZO is not set -# CONFIG_RD_LZ4 is not set -CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_EMBEDDED=y -# CONFIG_COMPAT_BRK is not set -CONFIG_PROFILING=y -CONFIG_MACH_PISTACHIO=y -CONFIG_MIPS_CPS=y -CONFIG_NR_CPUS=4 -CONFIG_PM_DEBUG=y -CONFIG_PM_ADVANCED_DEBUG=y -CONFIG_CPU_IDLE=y -# CONFIG_MIPS_CPS_CPUIDLE is not set -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_PARTITION_ADVANCED=y -# CONFIG_COMPACTION is not set -CONFIG_DEFAULT_MMAP_MIN_ADDR=32768 -CONFIG_ZSMALLOC=y -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_NET_KEY=m -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_IP_MROUTE=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m -# CONFIG_INET_DIAG is not set -CONFIG_TCP_CONG_ADVANCED=y -# CONFIG_TCP_CONG_BIC is not set -# CONFIG_TCP_CONG_WESTWOOD is not set -# CONFIG_TCP_CONG_HTCP is not set -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_MD5SIG=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_XFRM_MODE_TRANSPORT=m -CONFIG_INET6_XFRM_MODE_TUNNEL=m -CONFIG_INET6_XFRM_MODE_BEET=m -CONFIG_IPV6_SIT=m -CONFIG_NETWORK_SECMARK=y -CONFIG_NETFILTER=y -# CONFIG_BRIDGE_NETFILTER is not set -CONFIG_NF_CONNTRACK=y -CONFIG_NF_CT_NETLINK=y -CONFIG_NETFILTER_XT_MARK=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y -CONFIG_NETFILTER_XT_TARGET_DSCP=y -CONFIG_NETFILTER_XT_TARGET_NFLOG=y -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y -CONFIG_NETFILTER_XT_TARGET_SECMARK=y -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y -CONFIG_NETFILTER_XT_MATCH_DSCP=y -CONFIG_NETFILTER_XT_MATCH_POLICY=y -CONFIG_NETFILTER_XT_MATCH_STATE=y -CONFIG_NF_NAT_IPV4=m -CONFIG_IP_NF_IPTABLES=y -CONFIG_IP_NF_FILTER=y -CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_MANGLE=y -CONFIG_NF_NAT_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_BRIDGE=m -CONFIG_VLAN_8021Q=m -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_MARK=y -CONFIG_BT=m -CONFIG_BT_RFCOMM=m -CONFIG_BT_HCIBTUSB=m -CONFIG_BT_HCIBFUSB=m -CONFIG_BT_HCIVHCI=m -CONFIG_CFG80211=m -CONFIG_NL80211_TESTMODE=y -CONFIG_CFG80211_DEBUGFS=y -CONFIG_CFG80211_WEXT=y -CONFIG_MAC80211=m -CONFIG_MAC80211_LEDS=y -CONFIG_MAC80211_DEBUGFS=y -CONFIG_MAC80211_DEBUG_MENU=y -CONFIG_MAC80211_VERBOSE_DEBUG=y -CONFIG_RFKILL=y -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_DEBUG_DEVRES=y -CONFIG_CONNECTOR=y -CONFIG_MTD=y -CONFIG_MTD_BLOCK=y -CONFIG_MTD_SPI_NOR=y -CONFIG_MTD_UBI=y -CONFIG_MTD_UBI_BLOCK=y -CONFIG_ZRAM=m -CONFIG_BLK_DEV_LOOP=y -CONFIG_SCSI=y -CONFIG_BLK_DEV_SD=y -CONFIG_BLK_DEV_SR=m -CONFIG_SCSI_SPI_ATTRS=y -CONFIG_MD=y -CONFIG_BLK_DEV_DM=y -CONFIG_DM_CRYPT=y -CONFIG_DM_VERITY=y -CONFIG_NETDEVICES=y -CONFIG_TUN=m -CONFIG_VETH=m -# CONFIG_NET_VENDOR_MARVELL is not set -# CONFIG_NET_VENDOR_MICREL is not set -# CONFIG_NET_VENDOR_MICROCHIP is not set -# CONFIG_NET_VENDOR_NATSEMI is not set -# CONFIG_NET_VENDOR_SEEQ is not set -# CONFIG_NET_VENDOR_SMSC is not set -CONFIG_STMMAC_ETH=y -# 
CONFIG_NET_VENDOR_VIA is not set -CONFIG_PPP=m -CONFIG_PPP_ASYNC=m -CONFIG_USB_PEGASUS=m -CONFIG_USB_RTL8150=m -CONFIG_USB_RTL8152=m -CONFIG_USB_NET_DM9601=m -CONFIG_USB_NET_SMSC75XX=m -CONFIG_USB_NET_SMSC95XX=m -CONFIG_USB_NET_MCS7830=m -# CONFIG_USB_NET_CDC_SUBSET is not set -# CONFIG_USB_NET_ZAURUS is not set -CONFIG_HOSTAP=m -CONFIG_HOSTAP_FIRMWARE=y -CONFIG_HOSTAP_FIRMWARE_NVRAM=y -CONFIG_LIBERTAS_THINFIRM=m -CONFIG_RT2X00=m -CONFIG_RT2800USB=m -CONFIG_MAC80211_HWSIM=m -CONFIG_USB_NET_RNDIS_WLAN=m -CONFIG_INPUT_EVDEV=y -# CONFIG_KEYBOARD_ATKBD is not set -CONFIG_KEYBOARD_GPIO=y -# CONFIG_INPUT_MOUSE is not set -# CONFIG_SERIO is not set -# CONFIG_VT is not set -# CONFIG_LEGACY_PTYS is not set -CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DW=y -CONFIG_SERIAL_OF_PLATFORM=y -CONFIG_HW_RANDOM=y -CONFIG_TCG_TPM=y -CONFIG_I2C=y -CONFIG_I2C_CHARDEV=m -CONFIG_I2C_IMG=y -CONFIG_I2C_STUB=m -CONFIG_SPI=y -CONFIG_SPI_BITBANG=m -CONFIG_SPI_IMG_SPFI=y -CONFIG_SPI_SPIDEV=y -CONFIG_DEBUG_GPIO=y -CONFIG_GPIO_SYSFS=y -CONFIG_POWER_SUPPLY=y -CONFIG_THERMAL=y -CONFIG_WATCHDOG=y -CONFIG_IMGPDC_WDT=y -CONFIG_REGULATOR_FIXED_VOLTAGE=y -CONFIG_REGULATOR_GPIO=y -CONFIG_RC_CORE=y -CONFIG_RC_DEVICES=y -CONFIG_IR_IMG=y -CONFIG_IR_IMG_NEC=y -CONFIG_IR_IMG_JVC=y -CONFIG_IR_IMG_SONY=y -CONFIG_IR_IMG_SHARP=y -CONFIG_IR_IMG_SANYO=y -CONFIG_IR_IMG_RC5=y -CONFIG_IR_IMG_RC6=y -CONFIG_MEDIA_SUPPORT=y -CONFIG_FB=y -CONFIG_FB_MODE_HELPERS=y -# CONFIG_LCD_CLASS_DEVICE is not set -CONFIG_BACKLIGHT_CLASS_DEVICE=y -CONFIG_SOUND=y -CONFIG_SND=y -CONFIG_SND_HRTIMER=m -CONFIG_SND_DYNAMIC_MINORS=y -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m -# CONFIG_SND_SPI is not set -CONFIG_SND_USB_AUDIO=m -CONFIG_USB=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -# CONFIG_USB_DEFAULT_PERSIST is not set -CONFIG_USB_MON=y -CONFIG_USB_EHCI_HCD=y -CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_ACM=y -CONFIG_USB_STORAGE=y -CONFIG_USB_DWC2=y -CONFIG_USB_SERIAL=y -CONFIG_USB_SERIAL_GENERIC=y -CONFIG_USB_SERIAL_CP210X=m -CONFIG_USB_SERIAL_FTDI_SIO=m -CONFIG_USB_SERIAL_KEYSPAN=m -CONFIG_USB_SERIAL_PL2303=m -CONFIG_USB_SERIAL_OTI6858=m -CONFIG_USB_SERIAL_QUALCOMM=m -CONFIG_USB_SERIAL_SIERRAWIRELESS=m -CONFIG_USB_SERIAL_OPTION=m -CONFIG_MMC=y -CONFIG_MMC_BLOCK_MINORS=16 -CONFIG_MMC_TEST=m -CONFIG_MMC_DW=y -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=y -CONFIG_RTC_CLASS=y -CONFIG_DMADEVICES=y -CONFIG_IMG_MDC_DMA=y -CONFIG_STAGING=y -CONFIG_ASHMEM=y -# CONFIG_IOMMU_SUPPORT is not set -CONFIG_MEMORY=y -CONFIG_IIO=y -CONFIG_CC10001_ADC=y -CONFIG_PWM=y -CONFIG_PWM_IMG=y -CONFIG_PHY_PISTACHIO_USB=y -CONFIG_ANDROID=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -# CONFIG_DNOTIFY is not set -CONFIG_FUSE_FS=m -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -CONFIG_VFAT_FS=m -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_ECRYPT_FS=y -CONFIG_HFSPLUS_FS=m -CONFIG_UBIFS_FS=y -CONFIG_SQUASHFS=y -CONFIG_SQUASHFS_FILE_DIRECT=y -CONFIG_SQUASHFS_LZO=y -CONFIG_PSTORE=y -CONFIG_PSTORE_CONSOLE=y -CONFIG_PSTORE_RAM=y -CONFIG_NFS_FS=y -CONFIG_ROOT_NFS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_YAMA=y -CONFIG_CRYPTO_AUTHENC=y -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA256=y -CONFIG_CRYPTO_SHA512=m -CONFIG_CRYPTO_ARC4=y -CONFIG_CRYPTO_DES=y -CONFIG_CRC_CCITT=y -CONFIG_CRC_T10DIF=m -CONFIG_CRC7=m -# CONFIG_XZ_DEC_X86 is not set 
-CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0 -# CONFIG_SCHED_DEBUG is not set -CONFIG_SCHEDSTATS=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_DEBUG_CREDENTIALS=y -CONFIG_FUNCTION_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_LKDTM=y -CONFIG_TEST_UDELAY=m diff --git a/arch/mips/configs/rt305x_defconfig b/arch/mips/configs/rt305x_defconfig index fec5851c164b..eb359db15dba 100644 --- a/arch/mips/configs/rt305x_defconfig +++ b/arch/mips/configs/rt305x_defconfig @@ -90,7 +90,6 @@ CONFIG_PPPOE=m CONFIG_PPP_ASYNC=m CONFIG_ISDN=y CONFIG_INPUT=m -CONFIG_INPUT_POLLDEV=m # CONFIG_KEYBOARD_ATKBD is not set # CONFIG_INPUT_MOUSE is not set CONFIG_INPUT_MISC=y diff --git a/arch/mips/configs/xway_defconfig b/arch/mips/configs/xway_defconfig index 9abbc0debc2a..eeb689f715cb 100644 --- a/arch/mips/configs/xway_defconfig +++ b/arch/mips/configs/xway_defconfig @@ -96,7 +96,6 @@ CONFIG_PPPOE=m CONFIG_PPP_ASYNC=m CONFIG_ISDN=y CONFIG_INPUT=m -CONFIG_INPUT_POLLDEV=m # CONFIG_KEYBOARD_ATKBD is not set # CONFIG_INPUT_MOUSE is not set CONFIG_INPUT_MISC=y diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig index 657dd93c5e76..7dc5b3821cc6 100644 --- a/arch/mips/generic/Kconfig +++ b/arch/mips/generic/Kconfig @@ -58,6 +58,12 @@ config FIT_IMAGE_FDT_BOSTON enable this if you wish to boot on a MIPS Boston board, as it is expected by the bootloader. +config FIT_IMAGE_FDT_MARDUK + bool "Include FDT for IMG Pistachio Marduk (CI40) boards" + help + Enable this to include the FDT for the IMG Pistachio Marduk (CI40) + from Imagination Technologies in the FIT kernel image. + config FIT_IMAGE_FDT_NI169445 bool "Include FDT for NI 169445" help diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform index b871af16b5b6..e1abc113b409 100644 --- a/arch/mips/generic/Platform +++ b/arch/mips/generic/Platform @@ -24,3 +24,4 @@ its-$(CONFIG_FIT_IMAGE_FDT_LUTON) += board-luton.its.S its-$(CONFIG_FIT_IMAGE_FDT_JAGUAR2) += board-jaguar2.its.S its-$(CONFIG_FIT_IMAGE_FDT_SERVAL) += board-serval.its.S its-$(CONFIG_FIT_IMAGE_FDT_XILFPGA) += board-xilfpga.its.S +its-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += board-marduk.its.S diff --git a/arch/mips/generic/board-ingenic.c b/arch/mips/generic/board-ingenic.c index 0cec0bea13d6..3f44f14bdb33 100644 --- a/arch/mips/generic/board-ingenic.c +++ b/arch/mips/generic/board-ingenic.c @@ -7,6 +7,8 @@ * Copyright (C) 2020 Paul Cercueil <paul@crapouillou.net> */ +#include <linux/clk.h> +#include <linux/of.h> #include <linux/of_address.h> #include <linux/of_fdt.h> #include <linux/pm.h> @@ -21,6 +23,10 @@ static __init char *ingenic_get_system_type(unsigned long machtype) { switch (machtype) { + case MACH_INGENIC_X2100: + return "X2100"; + case MACH_INGENIC_X2000H: + return "X2000H"; case MACH_INGENIC_X2000E: return "X2000E"; case MACH_INGENIC_X2000: @@ -37,8 +43,18 @@ static __init char *ingenic_get_system_type(unsigned long machtype) return "JZ4775"; case MACH_INGENIC_JZ4770: return "JZ4770"; + case MACH_INGENIC_JZ4760B: + return "JZ4760B"; + case MACH_INGENIC_JZ4760: + return "JZ4760"; + case MACH_INGENIC_JZ4755: + return "JZ4755"; + case MACH_INGENIC_JZ4750: + return "JZ4750"; case MACH_INGENIC_JZ4725B: return "JZ4725B"; + case MACH_INGENIC_JZ4730: + return "JZ4730"; default: return "JZ4740"; } @@ -61,8 +77,13 @@ static __init const void *ingenic_fixup_fdt(const void *fdt, const void *match_d } static const struct of_device_id ingenic_of_match[] __initconst = { + { .compatible = "ingenic,jz4730", .data = (void *)MACH_INGENIC_JZ4730 }, { 
.compatible = "ingenic,jz4740", .data = (void *)MACH_INGENIC_JZ4740 }, { .compatible = "ingenic,jz4725b", .data = (void *)MACH_INGENIC_JZ4725B }, + { .compatible = "ingenic,jz4750", .data = (void *)MACH_INGENIC_JZ4750 }, + { .compatible = "ingenic,jz4755", .data = (void *)MACH_INGENIC_JZ4755 }, + { .compatible = "ingenic,jz4760", .data = (void *)MACH_INGENIC_JZ4760 }, + { .compatible = "ingenic,jz4760b", .data = (void *)MACH_INGENIC_JZ4760B }, { .compatible = "ingenic,jz4770", .data = (void *)MACH_INGENIC_JZ4770 }, { .compatible = "ingenic,jz4775", .data = (void *)MACH_INGENIC_JZ4775 }, { .compatible = "ingenic,jz4780", .data = (void *)MACH_INGENIC_JZ4780 }, @@ -71,6 +92,8 @@ static const struct of_device_id ingenic_of_match[] __initconst = { { .compatible = "ingenic,x1830", .data = (void *)MACH_INGENIC_X1830 }, { .compatible = "ingenic,x2000", .data = (void *)MACH_INGENIC_X2000 }, { .compatible = "ingenic,x2000e", .data = (void *)MACH_INGENIC_X2000E }, + { .compatible = "ingenic,x2000h", .data = (void *)MACH_INGENIC_X2000H }, + { .compatible = "ingenic,x2100", .data = (void *)MACH_INGENIC_X2100 }, {} }; @@ -108,10 +131,36 @@ static const struct platform_suspend_ops ingenic_pm_ops __maybe_unused = { static int __init ingenic_pm_init(void) { + struct device_node *cpu_node; + struct clk *cpu0_clk; + int ret; + if (boot_cpu_type() == CPU_XBURST) { if (IS_ENABLED(CONFIG_PM_SLEEP)) suspend_set_ops(&ingenic_pm_ops); _machine_halt = ingenic_halt; + + /* + * Unconditionally enable the clock for the first CPU. + * This makes sure that the PLL that feeds the CPU won't be + * stopped while the kernel is running. + */ + cpu_node = of_get_cpu_node(0, NULL); + if (!cpu_node) { + pr_err("Unable to get CPU node\n"); + } else { + cpu0_clk = of_clk_get(cpu_node, 0); + if (IS_ERR(cpu0_clk)) { + pr_err("Unable to get CPU0 clock\n"); + return PTR_ERR(cpu0_clk); + } + + ret = clk_prepare_enable(cpu0_clk); + if (ret) { + pr_err("Unable to enable CPU0 clock\n"); + return ret; + } + } } return 0; diff --git a/arch/mips/generic/board-marduk.its.S b/arch/mips/generic/board-marduk.its.S new file mode 100644 index 000000000000..4f633794db90 --- /dev/null +++ b/arch/mips/generic/board-marduk.its.S @@ -0,0 +1,22 @@ +/ { + images { + fdt-marduk { + description = "img,pistachio-marduk Device Tree"; + data = /incbin/("boot/dts/img/pistachio_marduk.dtb"); + type = "flat_dt"; + arch = "mips"; + compression = "none"; + hash { + algo = "sha1"; + }; + }; + }; + + configurations { + conf-marduk { + description = "Marduk Linux kernel"; + kernel = "kernel"; + fdt = "fdt-marduk"; + }; + }; +}; diff --git a/arch/mips/generic/board-ocelot.c b/arch/mips/generic/board-ocelot.c index c238e95190ac..7115410acb4f 100644 --- a/arch/mips/generic/board-ocelot.c +++ b/arch/mips/generic/board-ocelot.c @@ -26,13 +26,13 @@ static __init bool ocelot_detect(void) tlb_probe_hazard(); idx = read_c0_index(); if (idx < 0) - return 0; + return false; /* A TLB entry exists, lets assume its usable and check the CHIP ID */ rev = __raw_readl((void __iomem *)DEVCPU_GCB_CHIP_REGS_CHIP_ID); if ((rev & CHIP_ID_PART_ID) != OCELOT_PART_ID) - return 0; + return false; /* Copy command line from bootloader early for Initrd detection */ if (fw_arg0 < 10 && (fw_arg1 & 0xFFF00000) == 0x80000000) { @@ -44,7 +44,7 @@ static __init bool ocelot_detect(void) strcpy(arcs_cmdline, prom_argv[1]); } - return 1; + return true; } static void __init ocelot_earlyprintk_init(void) diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 
95e1f7f3597f..a0b9e7c1e4fc 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -206,7 +206,7 @@ ATOMIC_OPS(atomic64, xor, s64, ^=, xor, lld, scd) * The function returns the old value of @v minus @i. */ #define ATOMIC_SIP_OP(pfx, type, op, ll, sc) \ -static __inline__ int arch_##pfx##_sub_if_positive(type i, pfx##_t * v) \ +static __inline__ type arch_##pfx##_sub_if_positive(type i, pfx##_t * v) \ { \ type temp, result; \ \ diff --git a/arch/mips/include/asm/bootinfo.h b/arch/mips/include/asm/bootinfo.h index 4c2e8173e6ec..2128ba903391 100644 --- a/arch/mips/include/asm/bootinfo.h +++ b/arch/mips/include/asm/bootinfo.h @@ -75,6 +75,7 @@ enum ingenic_machine_type { MACH_INGENIC_JZ4750, MACH_INGENIC_JZ4755, MACH_INGENIC_JZ4760, + MACH_INGENIC_JZ4760B, MACH_INGENIC_JZ4770, MACH_INGENIC_JZ4775, MACH_INGENIC_JZ4780, @@ -83,6 +84,8 @@ enum ingenic_machine_type { MACH_INGENIC_X1830, MACH_INGENIC_X2000, MACH_INGENIC_X2000E, + MACH_INGENIC_X2000H, + MACH_INGENIC_X2100, }; extern char *system_type; diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index d687b40b9fbb..b3dc9c589442 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -125,13 +125,7 @@ static inline void kunmap_noncoherent(void) kunmap_coherent(); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ - BUG_ON(cpu_has_dc_aliases && PageHighMem(page)); - flush_dcache_page(page); -} - +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 /* * For now flush_kernel_vmap_range and invalidate_kernel_vmap_range both do a * cache writeback and invalidate operation. diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 53f015a1b0a7..bbb3bc5a42fd 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -96,14 +96,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = (struct pt_regs *) - ((unsigned long) current_thread_info() + THREAD_SIZE - 32) - 1; - - return (void __user *) (regs->regs[29] - len); -} - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h index 9e6211e6d76b..d45a52f65b7a 100644 --- a/arch/mips/include/asm/cpu.h +++ b/arch/mips/include/asm/cpu.h @@ -46,8 +46,8 @@ #define PRID_COMP_NETLOGIC 0x0c0000 #define PRID_COMP_CAVIUM 0x0d0000 #define PRID_COMP_LOONGSON 0x140000 -#define PRID_COMP_INGENIC_13 0x130000 /* X2000 */ -#define PRID_COMP_INGENIC_D0 0xd00000 /* JZ4740, JZ4750, X1830 */ +#define PRID_COMP_INGENIC_13 0x130000 /* X2000, X2100 */ +#define PRID_COMP_INGENIC_D0 0xd00000 /* JZ4730, JZ4740, JZ4750, JZ4755, JZ4760, X1830 */ #define PRID_COMP_INGENIC_D1 0xd10000 /* JZ4770, JZ4775, X1000 */ #define PRID_COMP_INGENIC_E1 0xe10000 /* JZ4780 */ diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 783fecce65c8..f8f74f9f5883 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -428,7 +428,6 @@ do { \ extern size_t __raw_copy_from_user(void *__to, const void *__from, size_t __n); extern size_t __raw_copy_to_user(void *__to, const void *__from, size_t __n); -extern size_t __raw_copy_in_user(void *__to, const void *__from, size_t __n); static inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) @@ -480,31 +479,6 @@ 
raw_copy_to_user(void __user *to, const void *from, unsigned long n) #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER -static inline unsigned long -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - register void __user *__cu_to_r __asm__("$4"); - register const void __user *__cu_from_r __asm__("$5"); - register long __cu_len_r __asm__("$6"); - - __cu_to_r = to; - __cu_from_r = from; - __cu_len_r = n; - - __asm__ __volatile__( - ".set\tnoreorder\n\t" - __MODULE_JAL(__raw_copy_in_user) - ".set\tnoat\n\t" - __UA_ADDU "\t$1, %1, %2\n\t" - ".set\tat\n\t" - ".set\treorder" - : "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r) - : - : "$8", "$9", "$10", "$11", "$12", "$14", "$15", "$24", "$31", - DADDI_SCRATCH, "memory"); - return __cu_len_r; -} - extern __kernel_size_t __bzero(void __user *addr, __kernel_size_t size); /* diff --git a/arch/mips/kernel/cacheinfo.c b/arch/mips/kernel/cacheinfo.c index 53d8ea7d36e6..495dd058231d 100644 --- a/arch/mips/kernel/cacheinfo.c +++ b/arch/mips/kernel/cacheinfo.c @@ -17,7 +17,7 @@ do { \ leaf++; \ } while (0) -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { struct cpuinfo_mips *c = &current_cpu_data; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -74,7 +74,7 @@ static void fill_cpumask_cluster(int cpu, cpumask_t *cpu_map) cpumask_set_cpu(cpu1, cpu_map); } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { struct cpuinfo_mips *c = &current_cpu_data; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -114,6 +114,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 6c590ef27648..67e130d3f038 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -76,13 +76,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; - get_online_cpus(); + cpus_read_lock(); rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { rcu_read_unlock(); - put_online_cpus(); + cpus_read_unlock(); return -ESRCH; } @@ -147,7 +147,7 @@ out_free_cpus_allowed: free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); - put_online_cpus(); + cpus_read_unlock(); return retval; } @@ -166,7 +166,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, if (len < real_len) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); rcu_read_lock(); retval = -ESRCH; @@ -182,7 +182,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, out_unlock: rcu_read_unlock(); - put_online_cpus(); + cpus_read_unlock(); if (retval) return retval; if (copy_to_user(user_mask_ptr, &mask, real_len)) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 73c8e7990a97..95aa86fa6077 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -859,10 +859,10 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value) * scheduled in then it will already have picked up the new FP mode * whilst doing so. 
*/ - get_online_cpus(); + cpus_read_lock(); for_each_cpu_and(cpu, &process_cpus, cpu_online_mask) work_on_cpu(cpu, prepare_for_fp_mode_switch, NULL); - put_online_cpus(); + cpus_read_unlock(); return 0; } diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 23a140327a0b..f979adfd4fc2 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -452,8 +452,9 @@ static void __init mips_parse_crashkernel(void) return; if (crash_base <= 0) { - crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + CRASH_ALIGN, + CRASH_ADDR_MAX); if (!crash_base) { pr_warn("crashkernel reservation failed - No suitable area found.\n"); return; @@ -461,8 +462,9 @@ static void __init mips_parse_crashkernel(void) } else { unsigned long long start; - start = memblock_find_in_range(crash_base, crash_base + crash_size, - crash_size, 1); + start = memblock_phys_alloc_range(crash_size, 1, + crash_base, + crash_base + crash_size); if (start != crash_base) { pr_warn("Invalid memory region reserved for crash kernel\n"); return; @@ -656,10 +658,6 @@ static void __init arch_mem_init(char **cmdline_p) mips_reserve_vmcore(); mips_parse_crashkernel(); -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - memblock_reserve(crashk_res.start, resource_size(&crashk_res)); -#endif device_tree_init(); /* diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index f1e985109da0..c9b2a75563e1 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -906,10 +906,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } user_enter(); } diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index c2d2e19abea8..70e32de2bcaa 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -239,9 +239,9 @@ 228 n32 clock_nanosleep sys_clock_nanosleep_time32 229 n32 tgkill sys_tgkill 230 n32 utimes sys_utimes_time32 -231 n32 mbind compat_sys_mbind -232 n32 get_mempolicy compat_sys_get_mempolicy -233 n32 set_mempolicy compat_sys_set_mempolicy +231 n32 mbind sys_mbind +232 n32 get_mempolicy sys_get_mempolicy +233 n32 set_mempolicy sys_set_mempolicy 234 n32 mq_open compat_sys_mq_open 235 n32 mq_unlink sys_mq_unlink 236 n32 mq_timedsend sys_mq_timedsend_time32 @@ -258,7 +258,7 @@ 247 n32 inotify_init sys_inotify_init 248 n32 inotify_add_watch sys_inotify_add_watch 249 n32 inotify_rm_watch sys_inotify_rm_watch -250 n32 migrate_pages compat_sys_migrate_pages +250 n32 migrate_pages sys_migrate_pages 251 n32 openat sys_openat 252 n32 mkdirat sys_mkdirat 253 n32 mknodat sys_mknodat @@ -279,7 +279,7 @@ 268 n32 sync_file_range sys_sync_file_range 269 n32 tee sys_tee 270 n32 vmsplice sys_vmsplice -271 n32 move_pages compat_sys_move_pages +271 n32 move_pages sys_move_pages 272 n32 set_robust_list compat_sys_set_robust_list 273 n32 get_robust_list compat_sys_get_robust_list 274 n32 kexec_load compat_sys_kexec_load @@ -385,3 +385,5 @@ 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n32 
process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index ac653d08b1ea..1ca7bc337932 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -361,3 +361,5 @@ 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n64 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index fae35882a165..a61c35edaa74 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -279,9 +279,9 @@ 265 o32 clock_nanosleep sys_clock_nanosleep_time32 266 o32 tgkill sys_tgkill 267 o32 utimes sys_utimes_time32 -268 o32 mbind sys_mbind compat_sys_mbind -269 o32 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 o32 set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 o32 mbind sys_mbind +269 o32 get_mempolicy sys_get_mempolicy +270 o32 set_mempolicy sys_set_mempolicy 271 o32 mq_open sys_mq_open compat_sys_mq_open 272 o32 mq_unlink sys_mq_unlink 273 o32 mq_timedsend sys_mq_timedsend_time32 @@ -298,7 +298,7 @@ 284 o32 inotify_init sys_inotify_init 285 o32 inotify_add_watch sys_inotify_add_watch 286 o32 inotify_rm_watch sys_inotify_rm_watch -287 o32 migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 o32 migrate_pages sys_migrate_pages 288 o32 openat sys_openat compat_sys_openat 289 o32 mkdirat sys_mkdirat 290 o32 mknodat sys_mknodat @@ -319,7 +319,7 @@ 305 o32 sync_file_range sys_sync_file_range sys32_sync_file_range 306 o32 tee sys_tee 307 o32 vmsplice sys_vmsplice -308 o32 move_pages sys_move_pages compat_sys_move_pages +308 o32 move_pages sys_move_pages 309 o32 set_robust_list sys_set_robust_list compat_sys_set_robust_list 310 o32 get_robust_list sys_get_robust_list compat_sys_get_robust_list 311 o32 kexec_load sys_kexec_load compat_sys_kexec_load @@ -434,3 +434,5 @@ 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 o32 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/uprobes.c b/arch/mips/kernel/uprobes.c index 6dbe4eab0a0e..9db2a6db5f62 100644 --- a/arch/mips/kernel/uprobes.c +++ b/arch/mips/kernel/uprobes.c @@ -75,7 +75,7 @@ bool is_trap_insn(uprobe_opcode_t *insn) case tlt_op: case tltu_op: case tne_op: - return 1; + return true; } break; @@ -87,12 +87,12 @@ bool is_trap_insn(uprobe_opcode_t *insn) case tlti_op: case tltiu_op: case tnei_op: - return 1; + return true; } break; } - return 0; + return false; } #define UPROBE_TRAP_NR ULONG_MAX @@ -254,9 +254,9 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) * See if the instruction can be emulated. * Returns true if instruction was emulated, false otherwise. * - * For now we always emulate so this function just returns 0. + * For now we always emulate so this function just returns false. 
*/ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - return 0; + return false; } diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index c67250a956b8..d3710959da55 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -2,21 +2,18 @@ # Makefile for KVM support for MIPS # -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o) +ccflags-y += -Ivirt/kvm -Iarch/mips/kvm -EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm +kvm-y := $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o) +kvm-$(CONFIG_CPU_HAS_MSA) += msa.o -common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o - -kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \ +kvm-y += mips.o emulate.o entry.o \ interrupt.o stats.o \ fpu.o -kvm-objs += hypcall.o -kvm-objs += mmu.o -ifdef CONFIG_CPU_LOONGSON64 -kvm-objs += loongson_ipi.o -endif +kvm-y += hypcall.o +kvm-y += mmu.o +kvm-$(CONFIG_CPU_LOONGSON64) += loongson_ipi.o -kvm-objs += vz.o +kvm-y += vz.o obj-$(CONFIG_KVM) += kvm.o obj-y += callback.o tlb.o diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index af9dd029a4e1..75c6f264c626 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -85,8 +83,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits), #endif }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 6d1f68cf4edf..1bfd1b501d82 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -442,7 +442,7 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn, bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_mips_flush_gpa_pt(kvm, range->start, range->end); - return 1; + return true; } bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -486,7 +486,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); if (!gpa_pte) - return 0; + return false; return pte_young(*gpa_pte); } diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 43cad10b877d..4adca5abbc72 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -388,7 +388,6 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, u32 compare, u32 cause) { u32 start_count, after_count; - ktime_t freeze_time; unsigned long flags; /* @@ -396,7 +395,7 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, * this with interrupts disabled to avoid latency. 
*/ local_irq_save(flags); - freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count); + kvm_mips_freeze_hrtimer(vcpu, &start_count); write_c0_gtoffset(start_count - read_c0_count()); local_irq_restore(flags); diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S index e19fb98b5d38..277c32296636 100644 --- a/arch/mips/lib/memcpy.S +++ b/arch/mips/lib/memcpy.S @@ -666,8 +666,6 @@ FEXPORT(__raw_copy_from_user) EXPORT_SYMBOL(__raw_copy_from_user) FEXPORT(__raw_copy_to_user) EXPORT_SYMBOL(__raw_copy_to_user) -FEXPORT(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) #endif /* Legacy Mode, user <-> user */ __BUILD_COPY_USER LEGACY_MODE USEROP USEROP @@ -703,13 +701,4 @@ EXPORT_SYMBOL(__raw_copy_to_user) __BUILD_COPY_USER EVA_MODE KERNELOP USEROP END(__raw_copy_to_user) -/* - * __copy_in_user (EVA) - */ - -LEAF(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) -__BUILD_COPY_USER EVA_MODE USEROP USEROP -END(__raw_copy_in_user) - #endif diff --git a/arch/mips/loongson2ef/common/Makefile b/arch/mips/loongson2ef/common/Makefile index d5ab3e543ea3..30ea8b5ca685 100644 --- a/arch/mips/loongson2ef/common/Makefile +++ b/arch/mips/loongson2ef/common/Makefile @@ -4,12 +4,14 @@ # obj-y += setup.o init.o env.o time.o reset.o irq.o \ - bonito-irq.o mem.o machtype.o platform.o serial.o + bonito-irq.o mem.o machtype.o platform.o obj-$(CONFIG_PCI) += pci.o # # Serial port support # +obj-$(CONFIG_LOONGSON_UART_BASE) += serial.o +obj-$(CONFIG_EARLY_PRINTK) += serial.o obj-$(CONFIG_LOONGSON_UART_BASE) += uart_base.o obj-$(CONFIG_LOONGSON_MC146818) += rtc.o diff --git a/arch/mips/mm/c-octeon.c b/arch/mips/mm/c-octeon.c index 8ae181e08311..ec2ae501539a 100644 --- a/arch/mips/mm/c-octeon.c +++ b/arch/mips/mm/c-octeon.c @@ -30,7 +30,7 @@ unsigned long long cache_err_dcache[NR_CPUS]; EXPORT_SYMBOL_GPL(cache_err_dcache); -/** +/* * Octeon automatically flushes the dcache on tlb changes, so * from Linux's viewpoint it acts much like a physically * tagged cache. No flushing is needed @@ -56,8 +56,8 @@ static void local_octeon_flush_icache_range(unsigned long start, } /** - * Flush caches as necessary for all cores affected by a - * vma. If no vma is supplied, all cores are flushed. + * octeon_flush_icache_all_cores - Flush caches as necessary for all cores + * affected by a vma. If no vma is supplied, all cores are flushed. * * @vma: VMA to flush or NULL to flush all icaches. */ @@ -92,7 +92,7 @@ static void octeon_flush_icache_all_cores(struct vm_area_struct *vma) } -/** +/* * Called to flush the icache on all cores */ static void octeon_flush_icache_all(void) @@ -102,8 +102,7 @@ static void octeon_flush_icache_all(void) /** - * Called to flush all memory associated with a memory - * context. + * octeon_flush_cache_mm - flush all memory associated with a memory context. 
* * @mm: Memory context to flush */ @@ -116,7 +115,7 @@ static void octeon_flush_cache_mm(struct mm_struct *mm) } -/** +/* * Flush a range of kernel addresses out of the icache * */ @@ -127,11 +126,11 @@ static void octeon_flush_icache_range(unsigned long start, unsigned long end) /** - * Flush a range out of a vma + * octeon_flush_cache_range - Flush a range out of a vma * * @vma: VMA to flush - * @start: - * @end: + * @start: beginning address for flush + * @end: ending address for flush */ static void octeon_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) @@ -142,11 +141,11 @@ static void octeon_flush_cache_range(struct vm_area_struct *vma, /** - * Flush a specific page of a vma + * octeon_flush_cache_page - Flush a specific page of a vma * * @vma: VMA to flush page for * @page: Page to flush - * @pfn: + * @pfn: Page frame number */ static void octeon_flush_cache_page(struct vm_area_struct *vma, unsigned long page, unsigned long pfn) @@ -160,7 +159,7 @@ static void octeon_flush_kernel_vmap_range(unsigned long vaddr, int size) BUG(); } -/** +/* * Probe Octeon's caches * */ @@ -256,7 +255,7 @@ static void octeon_cache_error_setup(void) set_handler(0x100, &except_vec2_octeon, 0x80); } -/** +/* * Setup the Octeon cache flush routines * */ @@ -341,7 +340,7 @@ asmlinkage void cache_parity_error_octeon_recoverable(void) co_cache_error_call_notifiers(0); } -/** +/* * Called when the the exception is not recoverable */ diff --git a/arch/mips/mti-malta/malta-dtshim.c b/arch/mips/mti-malta/malta-dtshim.c index 0ddf03df6268..f451268f6c38 100644 --- a/arch/mips/mti-malta/malta-dtshim.c +++ b/arch/mips/mti-malta/malta-dtshim.c @@ -22,7 +22,7 @@ #define ROCIT_CONFIG_GEN1_MEMMAP_SHIFT 8 #define ROCIT_CONFIG_GEN1_MEMMAP_MASK (0xf << 8) -static unsigned char fdt_buf[16 << 10] __initdata; +static unsigned char fdt_buf[16 << 10] __initdata __aligned(8); /* determined physical memory size, not overridden by command line args */ extern unsigned long physical_memsize; diff --git a/arch/mips/netlogic/xlr/fmn-config.c b/arch/mips/netlogic/xlr/fmn-config.c index c7622c6e5f67..15483537e8cf 100644 --- a/arch/mips/netlogic/xlr/fmn-config.c +++ b/arch/mips/netlogic/xlr/fmn-config.c @@ -103,18 +103,19 @@ static void check_credit_distribution(void) } /** - * Configure bucket size and credits for a device. 'size' is the size of - * the buckets for the device. This size is distributed among all the CPUs - * so that all of them can send messages to the device. - * - * The device is also given 'cpu_credits' to send messages to the CPUs - * + * setup_fmn_cc - Configure bucket size and credits for a device. * @dev_info: FMN information structure for each devices * @start_stn_id: Starting station id of dev_info * @end_stn_id: End station id of dev_info * @num_buckets: Total number of buckets for den_info * @cpu_credits: Allowed credits to cpu for each devices pointing by dev_info * @size: Size of the each buckets in the device station + * + * 'size' is the size of the buckets for the device. This size is + * distributed among all the CPUs + * so that all of them can send messages to the device. 
+ * + * The device is also given 'cpu_credits' to send messages to the CPUs */ static void setup_fmn_cc(struct xlr_fmn_info *dev_info, int start_stn_id, int end_stn_id, int num_buckets, int cpu_credits, int size) @@ -174,6 +175,8 @@ static void setup_cpu_fmninfo(struct xlr_fmn_info *cpu, int num_core) } /** + * xlr_board_info_setup - Setup FMN details + * * Setup the FMN details for each devices according to the device available * in each variant of XLR/XLS processor */ diff --git a/arch/mips/pistachio/Kconfig b/arch/mips/pistachio/Kconfig deleted file mode 100644 index 9a0e06c95184..000000000000 --- a/arch/mips/pistachio/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -config PISTACHIO_GPTIMER_CLKSRC - bool "Enable General Purpose Timer based clocksource" - depends on MACH_PISTACHIO - select CLKSRC_PISTACHIO - select MIPS_EXTERNAL_TIMER - help - This option enables a clocksource driver based on a Pistachio - SoC General Purpose external timer. - - If you want to enable the CPUFreq, you need to enable - this option. - - If you don't want to enable CPUFreq, you can leave this disabled. diff --git a/arch/mips/pistachio/Makefile b/arch/mips/pistachio/Makefile deleted file mode 100644 index 66f4af17fb66..000000000000 --- a/arch/mips/pistachio/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-y += init.o irq.o time.o diff --git a/arch/mips/pistachio/Platform b/arch/mips/pistachio/Platform deleted file mode 100644 index c59de86dbddf..000000000000 --- a/arch/mips/pistachio/Platform +++ /dev/null @@ -1,6 +0,0 @@ -# -# IMG Pistachio SoC -# -load-$(CONFIG_MACH_PISTACHIO) += 0xffffffff80400000 -zload-$(CONFIG_MACH_PISTACHIO) += 0xffffffff81000000 -all-$(CONFIG_MACH_PISTACHIO) := uImage.gz diff --git a/arch/mips/pistachio/init.c b/arch/mips/pistachio/init.c deleted file mode 100644 index e0bacfc3c6b4..000000000000 --- a/arch/mips/pistachio/init.c +++ /dev/null @@ -1,125 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Pistachio platform setup - * - * Copyright (C) 2014 Google, Inc. 
- * Copyright (C) 2016 Imagination Technologies - */ - -#include <linux/init.h> -#include <linux/io.h> -#include <linux/kernel.h> -#include <linux/of_address.h> -#include <linux/of_fdt.h> - -#include <asm/cacheflush.h> -#include <asm/fw/fw.h> -#include <asm/mips-boards/generic.h> -#include <asm/mips-cps.h> -#include <asm/prom.h> -#include <asm/smp-ops.h> -#include <asm/traps.h> - -/* - * Core revision register decoding - * Bits 23 to 20: Major rev - * Bits 15 to 8: Minor rev - * Bits 7 to 0: Maintenance rev - */ -#define PISTACHIO_CORE_REV_REG 0xB81483D0 -#define PISTACHIO_CORE_REV_A1 0x00100006 -#define PISTACHIO_CORE_REV_B0 0x00100106 - -const char *get_system_type(void) -{ - u32 core_rev; - const char *sys_type; - - core_rev = __raw_readl((const void *)PISTACHIO_CORE_REV_REG); - - switch (core_rev) { - case PISTACHIO_CORE_REV_B0: - sys_type = "IMG Pistachio SoC (B0)"; - break; - - case PISTACHIO_CORE_REV_A1: - sys_type = "IMG Pistachio SoC (A1)"; - break; - - default: - sys_type = "IMG Pistachio SoC"; - break; - } - - return sys_type; -} - -void __init *plat_get_fdt(void) -{ - if (fw_arg0 != -2) - panic("Device-tree not present"); - return (void *)fw_arg1; -} - -void __init plat_mem_setup(void) -{ - __dt_setup_arch(plat_get_fdt()); -} - -#define DEFAULT_CPC_BASE_ADDR 0x1bde0000 -#define DEFAULT_CDMM_BASE_ADDR 0x1bdd0000 - -phys_addr_t mips_cpc_default_phys_base(void) -{ - return DEFAULT_CPC_BASE_ADDR; -} - -phys_addr_t mips_cdmm_phys_base(void) -{ - return DEFAULT_CDMM_BASE_ADDR; -} - -static void __init mips_nmi_setup(void) -{ - void *base; - - base = cpu_has_veic ? - (void *)(CAC_BASE + 0xa80) : - (void *)(CAC_BASE + 0x380); - memcpy(base, except_vec_nmi, 0x80); - flush_icache_range((unsigned long)base, - (unsigned long)base + 0x80); -} - -static void __init mips_ejtag_setup(void) -{ - void *base; - extern char except_vec_ejtag_debug[]; - - base = cpu_has_veic ? - (void *)(CAC_BASE + 0xa00) : - (void *)(CAC_BASE + 0x300); - memcpy(base, except_vec_ejtag_debug, 0x80); - flush_icache_range((unsigned long)base, - (unsigned long)base + 0x80); -} - -void __init prom_init(void) -{ - board_nmi_handler_setup = mips_nmi_setup; - board_ejtag_handler_setup = mips_ejtag_setup; - - mips_cm_probe(); - mips_cpc_probe(); - register_cps_smp_ops(); - - pr_info("SoC Type: %s\n", get_system_type()); -} - -void __init device_tree_init(void) -{ - if (!initial_boot_params) - return; - - unflatten_and_copy_device_tree(); -} diff --git a/arch/mips/pistachio/irq.c b/arch/mips/pistachio/irq.c deleted file mode 100644 index 437c3101ac45..000000000000 --- a/arch/mips/pistachio/irq.c +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Pistachio IRQ setup - * - * Copyright (C) 2014 Google, Inc. - */ - -#include <linux/init.h> -#include <linux/irqchip.h> -#include <linux/kernel.h> - -#include <asm/cpu-features.h> -#include <asm/irq_cpu.h> - -void __init arch_init_irq(void) -{ - pr_info("EIC is %s\n", cpu_has_veic ? "on" : "off"); - pr_info("VINT is %s\n", cpu_has_vint ? "on" : "off"); - - if (!cpu_has_veic) - mips_cpu_irq_init(); - - irqchip_init(); -} diff --git a/arch/mips/pistachio/time.c b/arch/mips/pistachio/time.c deleted file mode 100644 index de64751dec40..000000000000 --- a/arch/mips/pistachio/time.c +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Pistachio clocksource/timer setup - * - * Copyright (C) 2014 Google, Inc. 
- */ - -#include <linux/clk.h> -#include <linux/clocksource.h> -#include <linux/init.h> -#include <linux/of.h> -#include <linux/of_clk.h> - -#include <asm/mips-cps.h> -#include <asm/time.h> - -unsigned int get_c0_compare_int(void) -{ - return gic_get_c0_compare_int(); -} - -int get_c0_perfcount_int(void) -{ - return gic_get_c0_perfcount_int(); -} -EXPORT_SYMBOL_GPL(get_c0_perfcount_int); - -int get_c0_fdc_int(void) -{ - return gic_get_c0_fdc_int(); -} - -void __init plat_time_init(void) -{ - struct device_node *np; - struct clk *clk; - - of_clk_init(NULL); - timer_probe(); - - np = of_get_cpu_node(0, NULL); - if (!np) { - pr_err("Failed to get CPU node\n"); - return; - } - - clk = of_clk_get(np, 0); - if (IS_ERR(clk)) { - pr_err("Failed to get CPU clock: %ld\n", PTR_ERR(clk)); - return; - } - - mips_hpt_frequency = clk_get_rate(clk) / 2; - clk_put(clk); -} diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 9c9f3877abf9..aea26e739543 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -46,6 +46,7 @@ config NDS32 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select SET_FS + select TRACE_IRQFLAGS_SUPPORT help Andes(nds32) Linux support. @@ -62,9 +63,6 @@ config GENERIC_LOCKBREAK def_bool y depends on PREEMPTION -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config STACKTRACE_SUPPORT def_bool y diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index 7d6824f7c0e8..c2a222ebfa2a 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -36,8 +36,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr); -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -void flush_kernel_dcache_page(struct page *page); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *addr, int size); void invalidate_kernel_vmap_range(void *addr, int size); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages) diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c index 41725eaf8bac..b3d34d646652 100644 --- a/arch/nds32/kernel/setup.c +++ b/arch/nds32/kernel/setup.c @@ -244,7 +244,6 @@ static void __init setup_memory(void) unsigned long ram_start_pfn; unsigned long free_ram_start_pfn; phys_addr_t memory_start, memory_end; - struct memblock_region *region; memory_end = memory_start = 0; diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index ee0d9ae192a5..f06421c645af 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -183,11 +183,6 @@ void __pgd_error(const char *file, int line, unsigned long val) } extern char *exception_vector, *exception_vector_end; -void __init trap_init(void) -{ - return; -} - void __init early_trap_init(void) { unsigned long ivb = 0; diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index ad5344ef5d33..07aac65d1cab 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -318,15 +318,6 @@ void flush_anon_page(struct vm_area_struct *vma, local_irq_restore(flags); } -void flush_kernel_dcache_page(struct page *page) -{ - unsigned long flags; - local_irq_save(flags); - cpu_dcache_wbinval_page((unsigned long)page_address(page)); - local_irq_restore(flags); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void flush_kernel_vmap_range(void *addr, int size) { unsigned long flags; diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 3efe5533ea1c..33fd06f5fa41 100644 --- 
a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -41,9 +41,6 @@ config NO_IOPORT_MAP config FPU def_bool n -config TRACE_IRQFLAGS_SUPPORT - def_bool n - menu "Kernel features" source "kernel/Kconfig.hz" diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index b172da4eb1a9..596986a74a26 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -105,11 +105,6 @@ void show_stack(struct task_struct *task, unsigned long *stack, printk("%s\n", loglvl); } -void __init trap_init(void) -{ - /* Nothing to do here */ -} - /* Breakpoint handler */ asmlinkage void breakpoint_c(struct pt_regs *fp) { diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 50035a9816c8..e804026b4797 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -37,6 +37,7 @@ config OPENRISC select GENERIC_IRQ_MULTI_HANDLER select MMU_GATHER_NO_RANGE if MMU select SET_FS + select TRACE_IRQFLAGS_SUPPORT config CPU_BIG_ENDIAN def_bool y @@ -50,9 +51,6 @@ config GENERIC_HWEIGHT config NO_IOPORT_MAP def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - # For now, use generic checksum functions #These can be reimplemented in assembly later if so inclined config GENERIC_CSUM diff --git a/arch/openrisc/boot/dts/or1klitex.dts b/arch/openrisc/boot/dts/or1klitex.dts index 3f9867aa3844..91c7173c50e6 100644 --- a/arch/openrisc/boot/dts/or1klitex.dts +++ b/arch/openrisc/boot/dts/or1klitex.dts @@ -41,10 +41,10 @@ interrupt-controller; }; - serial0: serial@e0002000 { + serial0: serial@e0006800 { device_type = "serial"; compatible = "litex,liteuart"; - reg = <0xe0002000 0x100>; + reg = <0xe0006800 0x100>; }; soc_ctrl0: soc_controller@e0000000 { @@ -52,4 +52,13 @@ reg = <0xe0000000 0xc>; status = "okay"; }; + + ethernet@e0001000 { + compatible = "litex,liteeth"; + reg = <0xe0001000 0x7c>, + <0xe0001800 0x0a>, + <0x80000000 0x2000>; + reg-names = "mac", "mdio", "buffer"; + interrupts = <2>; + }; }; diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig index 3c2c70d3d740..d695879a4d26 100644 --- a/arch/openrisc/configs/or1klitex_defconfig +++ b/arch/openrisc/configs/or1klitex_defconfig @@ -1,18 +1,24 @@ CONFIG_BLK_DEV_INITRD=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y -CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y CONFIG_EMBEDDED=y +CONFIG_OPENRISC_BUILTIN_DTB="or1klitex" CONFIG_HZ_100=y -CONFIG_INITRAMFS_SOURCE="openrisc-rootfs.cpio.gz" +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y CONFIG_OF_OVERLAY=y -CONFIG_OPENRISC_BUILTIN_DTB="or1klitex" -CONFIG_PANIC_ON_OOPS=y -CONFIG_PRINTK_TIME=y -CONFIG_LITEX_SOC_CONTROLLER=y +CONFIG_NETDEVICES=y +CONFIG_LITEX_LITEETH=y CONFIG_SERIAL_LITEUART=y CONFIG_SERIAL_LITEUART_CONSOLE=y -CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_TTY_PRINTK=y +CONFIG_LITEX_SOC_CONTROLLER=y +CONFIG_TMPFS=y +CONFIG_PRINTK_TIME=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BUG_ON_DATA_CORRUPTION=y diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 4ac591c9ca33..cdd657f80bfa 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -12,7 +12,7 @@ * et al. 
*/ -/* or32 pgtable.h - macros and functions to manipulate page tables +/* or1k pgtable.h - macros and functions to manipulate page tables * * Based on: * include/asm-cris/pgtable.h @@ -29,14 +29,14 @@ /* * The Linux memory management assumes a three-level page table setup. On - * or32, we use that, but "fold" the mid level into the top-level page + * or1k, we use that, but "fold" the mid level into the top-level page * table. Since the MMU TLB is software loaded through an interrupt, it * supports any page table structure, so we could have used a three-level * setup, but for the amounts of memory we normally use, a two-level is * probably more efficient. * * This file contains the functions and defines necessary to modify and use - * the or32 page table tree. + * the or1k page table tree. */ extern void paging_init(void); diff --git a/arch/openrisc/include/asm/setup.h b/arch/openrisc/include/asm/setup.h new file mode 100644 index 000000000000..9acbc5deda69 --- /dev/null +++ b/arch/openrisc/include/asm/setup.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021 Stafford Horne + */ +#ifndef _ASM_OR1K_SETUP_H +#define _ASM_OR1K_SETUP_H + +#include <linux/init.h> +#include <asm-generic/setup.h> + +#ifndef __ASSEMBLY__ +void __init or1k_early_setup(void *fdt); +#endif + +#endif /* _ASM_OR1K_SETUP_H */ diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index 4f9d2a261455..659834ab87fa 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -25,7 +25,7 @@ /* THREAD_SIZE is the size of the task_struct/kernel_stack combo. * normally, the stack is found by doing something like p + THREAD_SIZE - * in or32, a page is 8192 bytes, which seems like a sane size + * in or1k, a page is 8192 bytes, which seems like a sane size */ #define THREAD_SIZE_ORDER 0 diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index 947613f61d4a..edaa775a648e 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -326,7 +326,7 @@ EXCEPTION_ENTRY(_data_page_fault_handler) 1: l.ori r6,r0,0x0 // !write access 2: - /* call fault.c handler in or32/mm/fault.c */ + /* call fault.c handler in openrisc/mm/fault.c */ l.jal do_page_fault l.nop l.j _ret_from_exception @@ -348,7 +348,7 @@ EXCEPTION_ENTRY(_insn_page_fault_handler) /* r4 set be EXCEPTION_HANDLE */ // effective address of fault l.ori r6,r0,0x0 // !write access - /* call fault.c handler in or32/mm/fault.c */ + /* call fault.c handler in openrisc/mm/fault.c */ l.jal do_page_fault l.nop l.j _ret_from_exception @@ -547,6 +547,7 @@ EXCEPTION_ENTRY(_external_irq_handler) l.bnf 1f // ext irq enabled, all ok. 
l.nop +#ifdef CONFIG_PRINTK l.addi r1,r1,-0x8 l.movhi r3,hi(42f) l.ori r3,r3,lo(42f) @@ -560,6 +561,7 @@ EXCEPTION_ENTRY(_external_irq_handler) .string "\n\rESR interrupt bug: in _external_irq_handler (ESR %x)\n\r" .align 4 .previous +#endif l.ori r4,r4,SPR_SR_IEE // fix the bug // l.sw PT_SR(r1),r4 diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index af355e3f4619..15f1b38dfe03 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -599,7 +599,7 @@ flush_tlb: l.jal _flush_tlb l.nop -/* The MMU needs to be enabled before or32_early_setup is called */ +/* The MMU needs to be enabled before or1k_early_setup is called */ enable_mmu: /* @@ -641,9 +641,9 @@ enable_mmu: /* magic number mismatch, set fdt pointer to null */ l.or r25,r0,r0 _fdt_found: - /* pass fdt pointer to or32_early_setup in r3 */ + /* pass fdt pointer to or1k_early_setup in r3 */ l.or r3,r0,r25 - LOAD_SYMBOL_2_GPR(r24, or32_early_setup) + LOAD_SYMBOL_2_GPR(r24, or1k_early_setup) l.jalr r24 l.nop diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index eb62429681fc..b0698d9ce14f 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -14,8 +14,6 @@ */ #define __KERNEL_SYSCALLS__ -#include <stdarg.h> - #include <linux/errno.h> #include <linux/sched.h> #include <linux/sched/debug.h> diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index 8ae2da6ac097..0cd04d936a7a 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -209,7 +209,8 @@ void __init setup_cpuinfo(void) } /** - * or32_early_setup + * or1k_early_setup + * @fdt: pointer to the start of the device tree in memory or NULL * * Handles the pointer to the device tree that this kernel is to use * for establishing the available platform devices. @@ -217,7 +218,7 @@ void __init setup_cpuinfo(void) * Falls back on built-in device tree in case null pointer is passed. */ -void __init or32_early_setup(void *fdt) +void __init or1k_early_setup(void *fdt) { if (fdt) pr_info("FDT at %p\n", fdt); @@ -243,21 +244,6 @@ static inline unsigned long extract_value(unsigned long reg, unsigned long mask) return mask & reg; } -void __init detect_unit_config(unsigned long upr, unsigned long mask, - char *text, void (*func) (void)) -{ - if (text != NULL) - printk("%s", text); - - if (upr & mask) { - if (func != NULL) - func(); - else - printk("present\n"); - } else - printk("not present\n"); -} - /* * calibrate_delay * diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 4d61333c2623..aa1e709405ac 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -231,11 +231,6 @@ void unhandled_exception(struct pt_regs *regs, int ea, int vector) die("Oops", regs, 9); } -void __init trap_init(void) -{ - /* Nothing needs to be done */ -} - asmlinkage void do_trap(struct pt_regs *regs, unsigned long address) { force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc); diff --git a/arch/openrisc/lib/Makefile b/arch/openrisc/lib/Makefile index 79775aaa6baa..53327406b483 100644 --- a/arch/openrisc/lib/Makefile +++ b/arch/openrisc/lib/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only # -# Makefile for or32 specific library files.. +# Makefile for or1k specific library files.. 
# obj-y := delay.o string.o memset.o memcpy.o diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index ca97d9baab51..c730d1a51686 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -28,7 +28,7 @@ unsigned long pte_misses; /* updated by do_page_fault() */ unsigned long pte_errors; /* updated by do_page_fault() */ /* __PHX__ :: - check the vmalloc_fault in do_page_fault() - * - also look into include/asm-or32/mmu_context.h + * - also look into include/asm/mmu_context.h */ volatile pgd_t *current_pgd[NR_CPUS]; diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 95d4bbf4e455..4742b6f169b7 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -10,7 +10,6 @@ config PARISC select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_STRNLEN_USER select ARCH_NO_SG_CHAIN select ARCH_SUPPORTS_HUGETLBFS if PA20 select ARCH_SUPPORTS_MEMORY_FAILURE @@ -65,7 +64,7 @@ config PARISC select HAVE_KPROBES_ON_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS - select SET_FS + select TRACE_IRQFLAGS_SUPPORT help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/Kconfig.debug b/arch/parisc/Kconfig.debug index 1478ded0e247..f66554cd5c45 100644 --- a/arch/parisc/Kconfig.debug +++ b/arch/parisc/Kconfig.debug @@ -1,4 +1 @@ # SPDX-License-Identifier: GPL-2.0 - -config TRACE_IRQFLAGS_SUPPORT - def_bool y diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile index dff453687530..9fe54878167d 100644 --- a/arch/parisc/boot/compressed/Makefile +++ b/arch/parisc/boot/compressed/Makefile @@ -26,7 +26,7 @@ endif OBJECTS += $(obj)/head.o $(obj)/real2.o $(obj)/firmware.o $(obj)/misc.o $(obj)/piggy.o LDFLAGS_vmlinux := -X -e startup --as-needed -T -$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(LIBGCC) +$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(LIBGCC) FORCE $(call if_changed,ld) sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\|parisc_kernel_start\)$$/\#define SZ\2 0x\1/p' @@ -34,7 +34,7 @@ sed-sizes := -e 's/^\([0-9a-fA-F]*\) . 
\(__bss_start\|_end\|parisc_kernel_start\ quiet_cmd_sizes = GEN $@ cmd_sizes = $(NM) $< | sed -n $(sed-sizes) > $@ -$(obj)/sizes.h: vmlinux +$(obj)/sizes.h: vmlinux FORCE $(call if_changed,sizes) AFLAGS_head.o += -I$(objtree)/$(obj) -DBOOTLOADER @@ -70,19 +70,19 @@ suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_XZ) := xz -$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) +$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) +$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE $(call if_changed,lz4) -$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) +$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) -$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) +$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzo) -$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) +$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE $(call if_changed,xzkern) LDFLAGS_piggy.o := -r --format binary --oformat $(LD_BFD) -T -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE $(call if_changed,ld) diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c index 2d395998f524..7ee49f5881d1 100644 --- a/arch/parisc/boot/compressed/misc.c +++ b/arch/parisc/boot/compressed/misc.c @@ -26,7 +26,7 @@ extern char input_data[]; extern int input_len; /* output_len is inserted by the linker possibly at an unaligned address */ -extern __le32 output_len __aligned(1); +extern char output_len; extern char _text, _end; extern char _bss, _ebss; extern char _startcode_end; diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index 7611d48c599e..dd14e3131325 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -111,7 +111,6 @@ CONFIG_PPP_BSDCOMP=m CONFIG_PPP_DEFLATE=m CONFIG_PPPOE=m # CONFIG_WLAN is not set -CONFIG_INPUT_POLLDEV=y CONFIG_KEYBOARD_HIL_OLD=m CONFIG_KEYBOARD_HIL=m CONFIG_MOUSE_SERIAL=y diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 99663fc1f997..eef0096db5f8 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -36,16 +36,12 @@ void flush_cache_all_local(void); void flush_cache_all(void); void flush_cache_mm(struct mm_struct *mm); -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE void flush_kernel_dcache_page_addr(void *addr); -static inline void flush_kernel_dcache_page(struct page *page) -{ - flush_kernel_dcache_page_addr(page_address(page)); -} #define flush_kernel_dcache_range(start,size) \ flush_kernel_dcache_range_asm((start), (start)+(size)); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *vaddr, int size); void invalidate_kernel_vmap_range(void *vaddr, int size); @@ -59,7 +55,7 @@ extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_page(vma,page) do { \ - flush_kernel_dcache_page(page); \ + flush_kernel_dcache_page_addr(page_address(page)); \ flush_kernel_icache_page(page_address(page)); \ } while (0) diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index b5d90e82b65d..c04f5a637c39 100644 --- 
a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -163,12 +163,6 @@ struct compat_shmid64_ds { #define COMPAT_ELF_NGREG 80 typedef compat_ulong_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; -static __inline__ void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = ¤t->thread.regs; - return (void __user *)regs->gr[30]; -} - static inline int __is_compat_task(struct task_struct *t) { return test_tsk_thread_flag(t, TIF_32BIT); diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index d00313d1274e..0561568f7b48 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -184,7 +184,7 @@ extern int npmem_ranges; #include <asm-generic/getorder.h> #include <asm/pdc.h> -#define PAGE0 ((struct zeropage *)__PAGE_OFFSET) +#define PAGE0 ((struct zeropage *)absolute_pointer(__PAGE_OFFSET)) /* DEFINITION OF THE ZERO-PAGE (PAG0) */ /* based on work by Jason Eckhardt (jason@equator.com) */ diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index b5fbcd2c1780..eeb7da064289 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -101,10 +101,6 @@ DECLARE_PER_CPU(struct cpuinfo_parisc, cpu_data); #define CPU_HVERSION ((boot_cpu_data.hversion >> 4) & 0x0FFF) -typedef struct { - int seg; -} mm_segment_t; - #define ARCH_MIN_TASKALIGN 8 struct thread_struct { diff --git a/arch/parisc/include/asm/rt_sigframe.h b/arch/parisc/include/asm/rt_sigframe.h index 2b3010ade00e..4b9e3d707571 100644 --- a/arch/parisc/include/asm/rt_sigframe.h +++ b/arch/parisc/include/asm/rt_sigframe.h @@ -2,7 +2,7 @@ #ifndef _ASM_PARISC_RT_SIGFRAME_H #define _ASM_PARISC_RT_SIGFRAME_H -#define SIGRETURN_TRAMP 4 +#define SIGRETURN_TRAMP 3 #define SIGRESTARTBLOCK_TRAMP 5 #define TRAMP_SIZE (SIGRETURN_TRAMP + SIGRESTARTBLOCK_TRAMP) diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index 0bd38a972cea..00ad50fef769 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -11,7 +11,6 @@ struct thread_info { struct task_struct *task; /* main task structure */ unsigned long flags; /* thread_info flags (see TIF_*) */ - mm_segment_t addr_limit; /* user-level address space limit */ __u32 cpu; /* current CPU */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ }; @@ -21,7 +20,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .cpu = 0, \ - .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ } diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index ed2cd4fb479b..192ad9e11b25 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -11,14 +11,6 @@ #include <linux/bug.h> #include <linux/string.h> -#define KERNEL_DS ((mm_segment_t){0}) -#define USER_DS ((mm_segment_t){1}) - -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) - /* * Note that since kernel addresses are in a separate address space on * parisc, we don't need to do anything for access_ok(). 
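With KERNEL_DS, USER_DS and the get_fs()/set_fs() pair deleted just above, code that used to widen the address limit in order to peek at kernel memory is expected to go through the generic nofault helpers instead (the hunks that follow wire up __get_kernel_nofault and __put_kernel_nofault for exactly that). A minimal sketch of the caller-side pattern, assuming only the generic <linux/uaccess.h> helpers; the function name is made up for illustration:

/* --- illustrative sketch, not part of the diff --- */
#include <linux/uaccess.h>
#include <linux/errno.h>

/* Probe a kernel address without risking an oops: instead of the old
 * set_fs(KERNEL_DS) + get_user() dance, use the *_kernel_nofault()
 * helpers, which recover through the exception table on a fault. */
static int peek_kernel_long(const void *kaddr, unsigned long *out)
{
	unsigned long val;

	if (copy_from_kernel_nofault(&val, kaddr, sizeof(val)))
		return -EFAULT;

	*out = val;
	return 0;
}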
@@ -33,11 +25,11 @@ #define get_user __get_user #if !defined(CONFIG_64BIT) -#define LDD_USER(val, ptr) __get_user_asm64(val, ptr) -#define STD_USER(x, ptr) __put_user_asm64(x, ptr) +#define LDD_USER(sr, val, ptr) __get_user_asm64(sr, val, ptr) +#define STD_USER(sr, x, ptr) __put_user_asm64(sr, x, ptr) #else -#define LDD_USER(val, ptr) __get_user_asm(val, "ldd", ptr) -#define STD_USER(x, ptr) __put_user_asm("std", x, ptr) +#define LDD_USER(sr, val, ptr) __get_user_asm(sr, val, "ldd", ptr) +#define STD_USER(sr, x, ptr) __put_user_asm(sr, "std", x, ptr) #endif /* @@ -67,28 +59,15 @@ struct exception_table_entry { #define ASM_EXCEPTIONTABLE_ENTRY_EFAULT( fault_addr, except_addr )\ ASM_EXCEPTIONTABLE_ENTRY( fault_addr, except_addr + 1) -/* - * load_sr2() preloads the space register %%sr2 - based on the value of - * get_fs() - with either a value of 0 to access kernel space (KERNEL_DS which - * is 0), or with the current value of %%sr3 to access user space (USER_DS) - * memory. The following __get_user_asm() and __put_user_asm() functions have - * %%sr2 hard-coded to access the requested memory. - */ -#define load_sr2() \ - __asm__(" or,= %0,%%r0,%%r0\n\t" \ - " mfsp %%sr3,%0\n\t" \ - " mtsp %0,%%sr2\n\t" \ - : : "r"(get_fs()) : ) - -#define __get_user_internal(val, ptr) \ +#define __get_user_internal(sr, val, ptr) \ ({ \ register long __gu_err __asm__ ("r8") = 0; \ \ switch (sizeof(*(ptr))) { \ - case 1: __get_user_asm(val, "ldb", ptr); break; \ - case 2: __get_user_asm(val, "ldh", ptr); break; \ - case 4: __get_user_asm(val, "ldw", ptr); break; \ - case 8: LDD_USER(val, ptr); break; \ + case 1: __get_user_asm(sr, val, "ldb", ptr); break; \ + case 2: __get_user_asm(sr, val, "ldh", ptr); break; \ + case 4: __get_user_asm(sr, val, "ldw", ptr); break; \ + case 8: LDD_USER(sr, val, ptr); break; \ default: BUILD_BUG(); \ } \ \ @@ -97,15 +76,14 @@ struct exception_table_entry { #define __get_user(val, ptr) \ ({ \ - load_sr2(); \ - __get_user_internal(val, ptr); \ + __get_user_internal("%%sr3,", val, ptr); \ }) -#define __get_user_asm(val, ldx, ptr) \ +#define __get_user_asm(sr, val, ldx, ptr) \ { \ register long __gu_val; \ \ - __asm__("1: " ldx " 0(%%sr2,%2),%0\n" \ + __asm__("1: " ldx " 0(" sr "%2),%0\n" \ "9:\n" \ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ : "=r"(__gu_val), "=r"(__gu_err) \ @@ -114,9 +92,22 @@ struct exception_table_entry { (val) = (__force __typeof__(*(ptr))) __gu_val; \ } +#define HAVE_GET_KERNEL_NOFAULT +#define __get_kernel_nofault(dst, src, type, err_label) \ +{ \ + type __z; \ + long __err; \ + __err = __get_user_internal("%%sr0,", __z, (type *)(src)); \ + if (unlikely(__err)) \ + goto err_label; \ + else \ + *(type *)(dst) = __z; \ +} + + #if !defined(CONFIG_64BIT) -#define __get_user_asm64(val, ptr) \ +#define __get_user_asm64(sr, val, ptr) \ { \ union { \ unsigned long long l; \ @@ -124,8 +115,8 @@ struct exception_table_entry { } __gu_tmp; \ \ __asm__(" copy %%r0,%R0\n" \ - "1: ldw 0(%%sr2,%2),%0\n" \ - "2: ldw 4(%%sr2,%2),%R0\n" \ + "1: ldw 0(" sr "%2),%0\n" \ + "2: ldw 4(" sr "%2),%R0\n" \ "9:\n" \ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \ @@ -138,16 +129,16 @@ struct exception_table_entry { #endif /* !defined(CONFIG_64BIT) */ -#define __put_user_internal(x, ptr) \ +#define __put_user_internal(sr, x, ptr) \ ({ \ register long __pu_err __asm__ ("r8") = 0; \ __typeof__(*(ptr)) __x = (__typeof__(*(ptr)))(x); \ \ switch (sizeof(*(ptr))) { \ - case 1: __put_user_asm("stb", __x, ptr); break; \ - case 2: __put_user_asm("sth", __x, 
ptr); break; \ - case 4: __put_user_asm("stw", __x, ptr); break; \ - case 8: STD_USER(__x, ptr); break; \ + case 1: __put_user_asm(sr, "stb", __x, ptr); break; \ + case 2: __put_user_asm(sr, "sth", __x, ptr); break; \ + case 4: __put_user_asm(sr, "stw", __x, ptr); break; \ + case 8: STD_USER(sr, __x, ptr); break; \ default: BUILD_BUG(); \ } \ \ @@ -156,10 +147,20 @@ struct exception_table_entry { #define __put_user(x, ptr) \ ({ \ - load_sr2(); \ - __put_user_internal(x, ptr); \ + __put_user_internal("%%sr3,", x, ptr); \ }) +#define __put_kernel_nofault(dst, src, type, err_label) \ +{ \ + type __z = *(type *)(src); \ + long __err; \ + __err = __put_user_internal("%%sr0,", __z, (type *)(dst)); \ + if (unlikely(__err)) \ + goto err_label; \ +} + + + /* * The "__put_user/kernel_asm()" macros tell gcc they read from memory @@ -170,26 +171,26 @@ struct exception_table_entry { * r8 is already listed as err. */ -#define __put_user_asm(stx, x, ptr) \ - __asm__ __volatile__ ( \ - "1: " stx " %2,0(%%sr2,%1)\n" \ - "9:\n" \ - ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ - : "=r"(__pu_err) \ +#define __put_user_asm(sr, stx, x, ptr) \ + __asm__ __volatile__ ( \ + "1: " stx " %2,0(" sr "%1)\n" \ + "9:\n" \ + ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ + : "=r"(__pu_err) \ : "r"(ptr), "r"(x), "0"(__pu_err)) #if !defined(CONFIG_64BIT) -#define __put_user_asm64(__val, ptr) do { \ - __asm__ __volatile__ ( \ - "1: stw %2,0(%%sr2,%1)\n" \ - "2: stw %R2,4(%%sr2,%1)\n" \ - "9:\n" \ - ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ - ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \ - : "=r"(__pu_err) \ - : "r"(ptr), "r"(__val), "0"(__pu_err)); \ +#define __put_user_asm64(sr, __val, ptr) do { \ + __asm__ __volatile__ ( \ + "1: stw %2,0(" sr "%1)\n" \ + "2: stw %R2,4(" sr "%1)\n" \ + "9:\n" \ + ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ + ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \ + : "=r"(__pu_err) \ + : "r"(ptr), "r"(__val), "0"(__pu_err)); \ } while (0) #endif /* !defined(CONFIG_64BIT) */ @@ -200,14 +201,12 @@ struct exception_table_entry { */ extern long strncpy_from_user(char *, const char __user *, long); -extern unsigned lclear_user(void __user *, unsigned long); -extern long lstrnlen_user(const char __user *, long); +extern __must_check unsigned lclear_user(void __user *, unsigned long); +extern __must_check long strnlen_user(const char __user *src, long n); /* * Complex access routines -- macros */ -#define user_addr_max() (~0UL) -#define strnlen_user lstrnlen_user #define clear_user lclear_user #define __clear_user lclear_user @@ -215,8 +214,6 @@ unsigned long __must_check raw_copy_to_user(void __user *dst, const void *src, unsigned long len); unsigned long __must_check raw_copy_from_user(void *dst, const void __user *src, unsigned long len); -unsigned long __must_check raw_copy_in_user(void __user *dst, const void __user *src, - unsigned long len); #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/parisc/include/uapi/asm/swab.h b/arch/parisc/include/uapi/asm/swab.h deleted file mode 100644 index 35fb2d1bfbbd..000000000000 --- a/arch/parisc/include/uapi/asm/swab.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _PARISC_SWAB_H -#define _PARISC_SWAB_H - -#include <asm/bitsperlong.h> -#include <linux/types.h> -#include <linux/compiler.h> - -#define __SWAB_64_THRU_32__ - -static inline __attribute_const__ __u16 __arch_swab16(__u16 x) -{ - __asm__("dep %0, 15, 8, %0\n\t" /* deposit 00ab -> 0bab */ - "shd %%r0, %0, 8, %0" /* shift 000000ab -> 00ba */ 
- : "=r" (x) - : "0" (x)); - return x; -} -#define __arch_swab16 __arch_swab16 - -static inline __attribute_const__ __u32 __arch_swab24(__u32 x) -{ - __asm__("shd %0, %0, 8, %0\n\t" /* shift xabcxabc -> cxab */ - "dep %0, 15, 8, %0\n\t" /* deposit cxab -> cbab */ - "shd %%r0, %0, 8, %0" /* shift 0000cbab -> 0cba */ - : "=r" (x) - : "0" (x)); - return x; -} - -static inline __attribute_const__ __u32 __arch_swab32(__u32 x) -{ - unsigned int temp; - __asm__("shd %0, %0, 16, %1\n\t" /* shift abcdabcd -> cdab */ - "dep %1, 15, 8, %1\n\t" /* deposit cdab -> cbab */ - "shd %0, %1, 8, %0" /* shift abcdcbab -> dcba */ - : "=r" (x), "=&r" (temp) - : "0" (x)); - return x; -} -#define __arch_swab32 __arch_swab32 - -#if __BITS_PER_LONG > 32 -/* -** From "PA-RISC 2.0 Architecture", HP Professional Books. -** See Appendix I page 8 , "Endian Byte Swapping". -** -** Pretty cool algorithm: (* == zero'd bits) -** PERMH 01234567 -> 67452301 into %0 -** HSHL 67452301 -> 7*5*3*1* into %1 -** HSHR 67452301 -> *6*4*2*0 into %0 -** OR %0 | %1 -> 76543210 into %0 (all done!) -*/ -static inline __attribute_const__ __u64 __arch_swab64(__u64 x) -{ - __u64 temp; - __asm__("permh,3210 %0, %0\n\t" - "hshl %0, 8, %1\n\t" - "hshr,u %0, 8, %0\n\t" - "or %1, %0, %0" - : "=r" (x), "=&r" (temp) - : "0" (x)); - return x; -} -#define __arch_swab64 __arch_swab64 -#endif /* __BITS_PER_LONG > 32 */ - -#endif /* _PARISC_SWAB_H */ diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c index 33113ba24054..22924a3f1728 100644 --- a/arch/parisc/kernel/asm-offsets.c +++ b/arch/parisc/kernel/asm-offsets.c @@ -230,7 +230,6 @@ int main(void) DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); - DEFINE(TI_SEGMENT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); DEFINE(THREAD_SZ, sizeof(struct thread_info)); /* THREAD_SZ_ALGN includes space for a stack frame. 
*/ diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 86a1a63563fd..39e02227e231 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -334,7 +334,7 @@ void flush_dcache_page(struct page *page) return; } - flush_kernel_dcache_page(page); + flush_kernel_dcache_page_addr(page_address(page)); if (!mapping) return; @@ -375,7 +375,6 @@ EXPORT_SYMBOL(flush_dcache_page); /* Defined in arch/parisc/kernel/pacache.S */ EXPORT_SYMBOL(flush_kernel_dcache_range_asm); -EXPORT_SYMBOL(flush_kernel_dcache_page_asm); EXPORT_SYMBOL(flush_data_cache_local); EXPORT_SYMBOL(flush_kernel_icache_range_asm); diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 665b70086685..7034227dbdf3 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -51,7 +51,7 @@ * prumpf 991016 */ -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/delay.h> #include <linux/init.h> diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index e8a6a751dfd8..00297e8e1c88 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -32,7 +32,6 @@ EXPORT_SYMBOL(__xchg64); #include <linux/uaccess.h> EXPORT_SYMBOL(lclear_user); -EXPORT_SYMBOL(lstrnlen_user); #ifndef CONFIG_64BIT /* Needed so insmod can set dp value */ diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 184ec3c1eae4..38ec4ae81239 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -17,9 +17,6 @@ * Copyright (C) 2001-2014 Helge Deller <deller@gmx.de> * Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org> */ - -#include <stdarg.h> - #include <linux/elf.h> #include <linux/errno.h> #include <linux/kernel.h> diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 3fb86ee507dd..cceb09855e03 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -150,8 +150,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PA11 dma_ops_init(); #endif - - clear_sched_clock_stable(); } /* diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index db1a47cf424d..bbfe23c40c01 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -237,18 +237,22 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs, #endif usp = (regs->gr[30] & ~(0x01UL)); + sigframe_size = PARISC_RT_SIGFRAME_SIZE; #ifdef CONFIG_64BIT if (is_compat_task()) { /* The gcc alloca implementation leaves garbage in the upper 32 bits of sp */ usp = (compat_uint_t)usp; + sigframe_size = PARISC_RT_SIGFRAME_SIZE32; } #endif - /*FIXME: frame_size parameter is unused, remove it. */ - frame = get_sigframe(&ksig->ka, usp, sizeof(*frame)); + frame = get_sigframe(&ksig->ka, usp, sigframe_size); DBG(1,"SETUP_RT_FRAME: START\n"); DBG(1,"setup_rt_frame: frame %p info %p\n", frame, ksig->info); + start = (unsigned long) frame; + if (start >= user_addr_max() - sigframe_size) + return -EFAULT; #ifdef CONFIG_64BIT @@ -284,32 +288,21 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs, already in userspace. The first words of tramp are used to save the previous sigrestartblock trampoline that might be on the stack. We start the sigreturn trampoline at - SIGRESTARTBLOCK_TRAMP+X. */ + SIGRESTARTBLOCK_TRAMP. */ err |= __put_user(in_syscall ? 
INSN_LDI_R25_1 : INSN_LDI_R25_0, &frame->tramp[SIGRESTARTBLOCK_TRAMP+0]); - err |= __put_user(INSN_LDI_R20, - &frame->tramp[SIGRESTARTBLOCK_TRAMP+1]); err |= __put_user(INSN_BLE_SR2_R0, + &frame->tramp[SIGRESTARTBLOCK_TRAMP+1]); + err |= __put_user(INSN_LDI_R20, &frame->tramp[SIGRESTARTBLOCK_TRAMP+2]); - err |= __put_user(INSN_NOP, &frame->tramp[SIGRESTARTBLOCK_TRAMP+3]); - -#if DEBUG_SIG - /* Assert that we're flushing in the correct space... */ - { - unsigned long sid; - asm ("mfsp %%sr3,%0" : "=r" (sid)); - DBG(1,"setup_rt_frame: Flushing 64 bytes at space %#x offset %p\n", - sid, frame->tramp); - } -#endif - start = (unsigned long) &frame->tramp[0]; - end = (unsigned long) &frame->tramp[TRAMP_SIZE]; + start = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+0]; + end = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+3]; flush_user_dcache_range_asm(start, end); flush_user_icache_range_asm(start, end); /* TRAMP Words 0-4, Length 5 = SIGRESTARTBLOCK_TRAMP - * TRAMP Words 5-9, Length 4 = SIGRETURN_TRAMP + * TRAMP Words 5-7, Length 3 = SIGRETURN_TRAMP * So the SIGRETURN_TRAMP is at the end of SIGRESTARTBLOCK_TRAMP */ rp = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP]; @@ -353,11 +346,6 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs, /* The syscall return path will create IAOQ values from r31. */ - sigframe_size = PARISC_RT_SIGFRAME_SIZE; -#ifdef CONFIG_64BIT - if (is_compat_task()) - sigframe_size = PARISC_RT_SIGFRAME_SIZE32; -#endif if (in_syscall) { regs->gr[31] = haddr; #ifdef CONFIG_64BIT @@ -501,7 +489,6 @@ syscall_restart(struct pt_regs *regs, struct k_sigaction *ka) DBG(1,"ERESTARTNOHAND: returning -EINTR\n"); regs->gr[28] = -EINTR; break; - case -ERESTARTSYS: if (!(ka->sa.sa_flags & SA_RESTART)) { DBG(1,"ERESTARTSYS: putting -EINTR\n"); @@ -529,6 +516,10 @@ insert_restart_trampoline(struct pt_regs *regs) unsigned long end = (unsigned long) &usp[5]; long err = 0; + /* check that we don't exceed the stack */ + if (A(&usp[0]) >= user_addr_max() - 5 * sizeof(int)) + return; + /* Setup a trampoline to restart the syscall * with __NR_restart_syscall * @@ -569,10 +560,6 @@ insert_restart_trampoline(struct pt_regs *regs) } /* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - * * We need to be able to restore the syscall arguments (r21-r26) to * restart syscalls. 
Thus, the syscall path should save them in the * pt_regs structure (it's okay to do so since they are caller-save diff --git a/arch/parisc/kernel/signal32.h b/arch/parisc/kernel/signal32.h index f166250f2d06..a5bdbb5678b7 100644 --- a/arch/parisc/kernel/signal32.h +++ b/arch/parisc/kernel/signal32.h @@ -36,7 +36,7 @@ struct compat_regfile { compat_int_t rf_sar; }; -#define COMPAT_SIGRETURN_TRAMP 4 +#define COMPAT_SIGRETURN_TRAMP 3 #define COMPAT_SIGRESTARTBLOCK_TRAMP 5 #define COMPAT_TRAMP_SIZE (COMPAT_SIGRETURN_TRAMP + \ COMPAT_SIGRESTARTBLOCK_TRAMP) diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index eaf0603ae781..bf751e0732b7 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -292,9 +292,9 @@ 258 32 clock_nanosleep sys_clock_nanosleep_time32 258 64 clock_nanosleep sys_clock_nanosleep 259 common tgkill sys_tgkill -260 common mbind sys_mbind compat_sys_mbind -261 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -262 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +260 common mbind sys_mbind +261 common get_mempolicy sys_get_mempolicy +262 common set_mempolicy sys_set_mempolicy # 263 was vserver 264 common add_key sys_add_key 265 common request_key sys_request_key @@ -331,7 +331,7 @@ 292 64 sync_file_range sys_sync_file_range 293 common tee sys_tee 294 common vmsplice sys_vmsplice -295 common move_pages sys_move_pages compat_sys_move_pages +295 common move_pages sys_move_pages 296 common getcpu sys_getcpu 297 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 298 common statfs64 sys_statfs64 compat_sys_statfs64 @@ -444,3 +444,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 08e4d480abe1..9fb1e794831b 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -265,6 +265,9 @@ static int __init init_cr16_clocksource(void) (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc)) continue; + /* mark sched_clock unstable */ + clear_sched_clock_stable(); + clocksource_cr16.name = "cr16_unstable"; clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE; clocksource_cr16.rating = 0; @@ -272,10 +275,6 @@ static int __init init_cr16_clocksource(void) } } - /* XXX: We may want to mark sched_clock stable here if cr16 clocks are - * in sync: - * (clocksource_cr16.flags == CLOCK_SOURCE_IS_CONTINUOUS) */ - /* register at clocksource framework */ clocksource_register_hz(&clocksource_cr16, 100 * PAGE0->mem_10msec); diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 8d8441d4562a..747c328fb886 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -859,7 +859,3 @@ void __init early_trap_init(void) initialize_ivt(&fault_vector_20); } - -void __init trap_init(void) -{ -} diff --git a/arch/parisc/lib/iomap.c b/arch/parisc/lib/iomap.c index f03adb1999e7..367f6397bda7 100644 --- a/arch/parisc/lib/iomap.c +++ b/arch/parisc/lib/iomap.c @@ -513,12 +513,15 @@ void ioport_unmap(void __iomem *addr) } } +#ifdef CONFIG_PCI void pci_iounmap(struct pci_dev *dev, void __iomem * addr) { if (!INDIRECT_ADDR(addr)) { iounmap(addr); } } +EXPORT_SYMBOL(pci_iounmap); +#endif EXPORT_SYMBOL(ioread8); EXPORT_SYMBOL(ioread16); @@ -544,4 +547,3 @@ EXPORT_SYMBOL(iowrite16_rep); 
EXPORT_SYMBOL(iowrite32_rep); EXPORT_SYMBOL(ioport_map); EXPORT_SYMBOL(ioport_unmap); -EXPORT_SYMBOL(pci_iounmap); diff --git a/arch/parisc/lib/lusercopy.S b/arch/parisc/lib/lusercopy.S index 36d6a8638ead..b428d29e45fb 100644 --- a/arch/parisc/lib/lusercopy.S +++ b/arch/parisc/lib/lusercopy.S @@ -28,21 +28,6 @@ #include <linux/linkage.h> /* - * get_sr gets the appropriate space value into - * sr1 for kernel/user space access, depending - * on the flag stored in the task structure. - */ - - .macro get_sr - mfctl %cr30,%r1 - ldw TI_SEGMENT(%r1),%r22 - mfsp %sr3,%r1 - or,<> %r22,%r0,%r0 - copy %r0,%r1 - mtsp %r1,%sr1 - .endm - - /* * unsigned long lclear_user(void *to, unsigned long n) * * Returns 0 for success. @@ -51,10 +36,9 @@ ENTRY_CFI(lclear_user) comib,=,n 0,%r25,$lclu_done - get_sr $lclu_loop: addib,<> -1,%r25,$lclu_loop -1: stbs,ma %r0,1(%sr1,%r26) +1: stbs,ma %r0,1(%sr3,%r26) $lclu_done: bv %r0(%r2) @@ -67,40 +51,6 @@ $lclu_done: ENDPROC_CFI(lclear_user) - /* - * long lstrnlen_user(char *s, long n) - * - * Returns 0 if exception before zero byte or reaching N, - * N+1 if N would be exceeded, - * else strlen + 1 (i.e. includes zero byte). - */ - -ENTRY_CFI(lstrnlen_user) - comib,= 0,%r25,$lslen_nzero - copy %r26,%r24 - get_sr -1: ldbs,ma 1(%sr1,%r26),%r1 -$lslen_loop: - comib,=,n 0,%r1,$lslen_done - addib,<> -1,%r25,$lslen_loop -2: ldbs,ma 1(%sr1,%r26),%r1 -$lslen_done: - bv %r0(%r2) - sub %r26,%r24,%r28 - -$lslen_nzero: - b $lslen_done - ldo 1(%r26),%r26 /* special case for N == 0 */ - -3: b $lslen_done - copy %r24,%r26 /* reset r26 so 0 is returned on fault */ - - ASM_EXCEPTIONTABLE_ENTRY(1b,3b) - ASM_EXCEPTIONTABLE_ENTRY(2b,3b) - -ENDPROC_CFI(lstrnlen_user) - - /* * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) * diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index 4b75388190b4..ea70a0e08321 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -38,14 +38,6 @@ unsigned long raw_copy_from_user(void *dst, const void __user *src, } EXPORT_SYMBOL(raw_copy_from_user); -unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long len) -{ - mtsp(get_user_space(), 1); - mtsp(get_user_space(), 2); - return pa_memcpy((void __force *)dst, (void __force *)src, len); -} - - void * memcpy(void * dst,const void *src, size_t count) { mtsp(get_kernel_space(), 1); @@ -54,7 +46,6 @@ void * memcpy(void * dst,const void *src, size_t count) return dst; } -EXPORT_SYMBOL(raw_copy_in_user); EXPORT_SYMBOL(memcpy); bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 678a0acbad7b..ba5b66189358 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -94,10 +94,6 @@ config STACKTRACE_SUPPORT bool default y -config TRACE_IRQFLAGS_SUPPORT - bool - default y - config LOCKDEP_SUPPORT bool default y @@ -123,6 +119,7 @@ config PPC select ARCH_HAS_COPY_MC if PPC64 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE + select ARCH_HAS_DEBUG_WX if STRICT_KERNEL_RWX select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_MAP_DIRECT if PPC_PSERIES select ARCH_HAS_ELF_RANDOMIZE @@ -182,6 +179,7 @@ config PPC select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI + select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_TIME_NS @@ -268,6 +266,7 @@ config PPC select STRICT_KERNEL_RWX if STRICT_MODULE_RWX select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK 
+ select TRACE_IRQFLAGS_SUPPORT select VIRT_TO_BUS if !PPC64 # # Please keep this list sorted alphabetically. diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 205cd77f321f..192f0ed0097f 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -365,36 +365,6 @@ config FAIL_IOMMU If you are unsure, say N. -config PPC_PTDUMP - bool "Export kernel pagetable layout to userspace via debugfs" - depends on DEBUG_KERNEL && DEBUG_FS - help - This option exports the state of the kernel pagetables to a - debugfs file. This is only useful for kernel developers who are - working in architecture specific areas of the kernel - probably - not a good idea to enable this feature in a production kernel. - - If you are unsure, say N. - -config PPC_DEBUG_WX - bool "Warn on W+X mappings at boot" - depends on PPC_PTDUMP && STRICT_KERNEL_RWX - help - Generate a warning if any W+X mappings are found at boot. - - This is useful for discovering cases where the kernel is leaving - W+X mappings after applying NX, as such mappings are a security risk. - - Note that even if the check fails, your kernel is possibly - still fine, as W+X mappings are not a security hole in - themselves, what they do is that they make the exploitation - of other unfixed kernel bugs easier. - - There is no runtime or memory usage effect of this option - once the kernel has booted up - it's a one time check. - - If in doubt, say "Y". - config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 6505d66f1193..aa6808e70647 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -122,6 +122,7 @@ endif LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie +LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) += -z notext LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) ifdef CONFIG_PPC64 @@ -407,7 +408,8 @@ endef PHONY += install install: - $(Q)$(MAKE) $(build)=$(boot) install + sh -x $(srctree)/$(boot)/install.sh "$(KERNELRELEASE)" vmlinux \ + System.map "$(INSTALL_PATH)" archclean: $(Q)$(MAKE) $(clean)=$(boot) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index e312ea802aa6..089ee3ea55c8 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -35,7 +35,6 @@ endif BOOTCFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -O2 -msoft-float -mno-altivec -mno-vsx \ -pipe -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \ - -include $(srctree)/include/linux/compiler_attributes.h \ $(LINUXINCLUDE) ifdef CONFIG_PPC64_BOOT_WRAPPER @@ -70,6 +69,7 @@ ifeq ($(call cc-option-yn, -fstack-protector),y) BOOTCFLAGS += -fno-stack-protector endif +BOOTCFLAGS += -include $(srctree)/include/linux/compiler_attributes.h BOOTCFLAGS += -I$(objtree)/$(obj) -I$(srctree)/$(obj) DTC_FLAGS ?= -p 1024 @@ -341,7 +341,6 @@ image-$(CONFIG_TQM8541) += cuImage.tqm8541 image-$(CONFIG_TQM8548) += cuImage.tqm8548 image-$(CONFIG_TQM8555) += cuImage.tqm8555 image-$(CONFIG_TQM8560) += cuImage.tqm8560 -image-$(CONFIG_SBC8548) += cuImage.sbc8548 image-$(CONFIG_KSI8560) += cuImage.ksi8560 # Board ports in arch/powerpc/platform/86xx/Kconfig @@ -444,16 +443,6 @@ $(obj)/zImage: $(addprefix $(obj)/, $(image-y)) $(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y)) $(Q)rm -f $@; ln $< $@ -# Only install the vmlinux -install: $(CONFIGURE) $(addprefix $(obj)/, $(image-y)) - sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map 
"$(INSTALL_PATH)" - -# Install the vmlinux and other built boot targets. -zInstall: $(CONFIGURE) $(addprefix $(obj)/, $(image-y)) - sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" $^ - -PHONY += install zInstall - # anything not in $(targets) clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ zImage zImage.initrd zImage.chrp zImage.coff zImage.holly \ diff --git a/arch/powerpc/boot/dts/fsl/sbc8641d.dts b/arch/powerpc/boot/dts/fsl/sbc8641d.dts deleted file mode 100644 index 3dca10acc161..000000000000 --- a/arch/powerpc/boot/dts/fsl/sbc8641d.dts +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8641D Device Tree Source - * - * Copyright 2008 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - * - * Based largely on the mpc8641_hpcn.dts by Freescale Semiconductor Inc. - */ - -/include/ "mpc8641si-pre.dtsi" - -/ { - model = "SBC8641D"; - compatible = "wind,sbc8641"; - - memory { - device_type = "memory"; - reg = <0x00000000 0x20000000>; // 512M at 0x0 - }; - - lbc: localbus@f8005000 { - reg = <0xf8005000 0x1000>; - - ranges = <0 0 0xff000000 0x01000000 // 16MB Boot flash - 1 0 0xf0000000 0x00010000 // 64KB EEPROM - 2 0 0xf1000000 0x00100000 // EPLD (1MB) - 3 0 0xe0000000 0x04000000 // 64MB LB SDRAM (CS3) - 4 0 0xe4000000 0x04000000 // 64MB LB SDRAM (CS4) - 6 0 0xf4000000 0x00100000 // LCD display (1MB) - 7 0 0xe8000000 0x04000000>; // 64MB OneNAND - - flash@0,0 { - compatible = "cfi-flash"; - reg = <0 0 0x01000000>; - bank-width = <2>; - device-width = <2>; - #address-cells = <1>; - #size-cells = <1>; - partition@0 { - label = "dtb"; - reg = <0x00000000 0x00100000>; - read-only; - }; - partition@300000 { - label = "kernel"; - reg = <0x00100000 0x00400000>; - read-only; - }; - partition@400000 { - label = "fs"; - reg = <0x00500000 0x00a00000>; - }; - partition@700000 { - label = "firmware"; - reg = <0x00f00000 0x00100000>; - read-only; - }; - }; - - epld@2,0 { - compatible = "wrs,epld-localbus"; - #address-cells = <2>; - #size-cells = <1>; - reg = <2 0 0x100000>; - ranges = <0 0 5 0 1 // User switches - 1 0 5 1 1 // Board ID/Rev - 3 0 5 3 1>; // LEDs - }; - }; - - soc: soc@f8000000 { - ranges = <0x00000000 0xf8000000 0x00100000>; - - enet0: ethernet@24000 { - tbi-handle = <&tbi0>; - phy-handle = <&phy0>; - phy-connection-type = "rgmii-id"; - }; - - mdio@24520 { - phy0: ethernet-phy@1f { - reg = <0x1f>; - }; - phy1: ethernet-phy@0 { - reg = <0>; - }; - phy2: ethernet-phy@1 { - reg = <1>; - }; - phy3: ethernet-phy@2 { - reg = <2>; - }; - tbi0: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - - enet1: ethernet@25000 { - tbi-handle = <&tbi1>; - phy-handle = <&phy1>; - phy-connection-type = "rgmii-id"; - }; - - mdio@25520 { - tbi1: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - - enet2: ethernet@26000 { - tbi-handle = <&tbi2>; - phy-handle = <&phy2>; - phy-connection-type = "rgmii-id"; - }; - - mdio@26520 { - tbi2: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - - enet3: ethernet@27000 { - tbi-handle = <&tbi3>; - phy-handle = <&phy3>; - phy-connection-type = "rgmii-id"; - }; - - mdio@27520 { - tbi3: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - }; - - pci0: pcie@f8008000 { - reg = <0xf8008000 0x1000>; - ranges = <0x02000000 0x0 0x80000000 0x80000000 0x0 0x20000000 - 0x01000000 0x0 0x00000000 0xe2000000 0x0 0x00100000>; - interrupt-map-mask = <0xff00 0 0 7>; - - pcie@0 { - ranges = 
<0x02000000 0x0 0x80000000 - 0x02000000 0x0 0x80000000 - 0x0 0x20000000 - - 0x01000000 0x0 0x00000000 - 0x01000000 0x0 0x00000000 - 0x0 0x00100000>; - }; - - }; - - pci1: pcie@f8009000 { - reg = <0xf8009000 0x1000>; - ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000 - 0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>; - - pcie@0 { - ranges = <0x02000000 0x0 0xa0000000 - 0x02000000 0x0 0xa0000000 - 0x0 0x20000000 - - 0x01000000 0x0 0x00000000 - 0x01000000 0x0 0x00000000 - 0x0 0x00100000>; - }; - }; -}; - -/include/ "mpc8641si-post.dtsi" diff --git a/arch/powerpc/boot/dts/microwatt.dts b/arch/powerpc/boot/dts/microwatt.dts index 974abbdda249..65b270a90f94 100644 --- a/arch/powerpc/boot/dts/microwatt.dts +++ b/arch/powerpc/boot/dts/microwatt.dts @@ -127,6 +127,18 @@ fifo-size = <16>; interrupts = <0x10 0x1>; }; + + ethernet@8020000 { + compatible = "litex,liteeth"; + reg = <0x8021000 0x100 + 0x8020800 0x100 + 0x8030000 0x2000>; + reg-names = "mac", "mido", "buffer"; + litex,rx-slots = <2>; + litex,tx-slots = <2>; + litex,slot-size = <0x800>; + interrupts = <0x11 0x1>; + }; }; chosen { diff --git a/arch/powerpc/boot/dts/sbc8548-altflash.dts b/arch/powerpc/boot/dts/sbc8548-altflash.dts deleted file mode 100644 index bb7a1e712bb7..000000000000 --- a/arch/powerpc/boot/dts/sbc8548-altflash.dts +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Configured for booting off the alternate (64MB SODIMM) flash. - * Requires switching JP12 jumpers and changing SW2.8 setting. - * - * Copyright 2013 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - - -/dts-v1/; - -/include/ "sbc8548-pre.dtsi" - -/{ - localbus@e0000000 { - #address-cells = <2>; - #size-cells = <1>; - compatible = "simple-bus"; - reg = <0xe0000000 0x5000>; - interrupt-parent = <&mpic>; - - ranges = <0x0 0x0 0xfc000000 0x04000000 /*64MB Flash*/ - 0x3 0x0 0xf0000000 0x04000000 /*64MB SDRAM*/ - 0x4 0x0 0xf4000000 0x04000000 /*64MB SDRAM*/ - 0x5 0x0 0xf8000000 0x00b10000 /* EPLD */ - 0x6 0x0 0xef800000 0x00800000>; /*8MB Flash*/ - - flash@0,0 { - #address-cells = <1>; - #size-cells = <1>; - reg = <0x0 0x0 0x04000000>; - compatible = "intel,JS28F128", "cfi-flash"; - bank-width = <4>; - device-width = <1>; - partition@0 { - label = "space"; - /* FC000000 -> FFEFFFFF */ - reg = <0x00000000 0x03f00000>; - }; - partition@3f00000 { - label = "bootloader"; - /* FFF00000 -> FFFFFFFF */ - reg = <0x03f00000 0x00100000>; - read-only; - }; - }; - - - epld@5,0 { - compatible = "wrs,epld-localbus"; - #address-cells = <2>; - #size-cells = <1>; - reg = <0x5 0x0 0x00b10000>; - ranges = < - 0x0 0x0 0x5 0x000000 0x1fff /* LED */ - 0x1 0x0 0x5 0x100000 0x1fff /* Switches */ - 0x3 0x0 0x5 0x300000 0x1fff /* HW Rev. 
*/ - 0xb 0x0 0x5 0xb00000 0x1fff /* EEPROM */ - >; - - led@0,0 { - compatible = "led"; - reg = <0x0 0x0 0x1fff>; - }; - - switches@1,0 { - compatible = "switches"; - reg = <0x1 0x0 0x1fff>; - }; - - hw-rev@3,0 { - compatible = "hw-rev"; - reg = <0x3 0x0 0x1fff>; - }; - - eeprom@b,0 { - compatible = "eeprom"; - reg = <0xb 0 0x1fff>; - }; - - }; - - alt-flash@6,0 { - #address-cells = <1>; - #size-cells = <1>; - compatible = "intel,JS28F640", "cfi-flash"; - reg = <0x6 0x0 0x800000>; - bank-width = <1>; - device-width = <1>; - partition@0 { - label = "space"; - /* EF800000 -> EFF9FFFF */ - reg = <0x00000000 0x007a0000>; - }; - partition@7a0000 { - label = "bootloader"; - /* EFFA0000 -> EFFFFFFF */ - reg = <0x007a0000 0x00060000>; - read-only; - }; - }; - - - }; -}; - -/include/ "sbc8548-post.dtsi" diff --git a/arch/powerpc/boot/dts/sbc8548-post.dtsi b/arch/powerpc/boot/dts/sbc8548-post.dtsi deleted file mode 100644 index 9d848d409408..000000000000 --- a/arch/powerpc/boot/dts/sbc8548-post.dtsi +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Copyright 2007 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - -/{ - soc8548@e0000000 { - #address-cells = <1>; - #size-cells = <1>; - device_type = "soc"; - ranges = <0x00000000 0xe0000000 0x00100000>; - bus-frequency = <0>; - compatible = "simple-bus"; - - ecm-law@0 { - compatible = "fsl,ecm-law"; - reg = <0x0 0x1000>; - fsl,num-laws = <10>; - }; - - ecm@1000 { - compatible = "fsl,mpc8548-ecm", "fsl,ecm"; - reg = <0x1000 0x1000>; - interrupts = <17 2>; - interrupt-parent = <&mpic>; - }; - - memory-controller@2000 { - compatible = "fsl,mpc8548-memory-controller"; - reg = <0x2000 0x1000>; - interrupt-parent = <&mpic>; - interrupts = <0x12 0x2>; - }; - - L2: l2-cache-controller@20000 { - compatible = "fsl,mpc8548-l2-cache-controller"; - reg = <0x20000 0x1000>; - cache-line-size = <0x20>; // 32 bytes - cache-size = <0x80000>; // L2, 512K - interrupt-parent = <&mpic>; - interrupts = <0x10 0x2>; - }; - - i2c@3000 { - #address-cells = <1>; - #size-cells = <0>; - cell-index = <0>; - compatible = "fsl-i2c"; - reg = <0x3000 0x100>; - interrupts = <0x2b 0x2>; - interrupt-parent = <&mpic>; - dfsrr; - }; - - i2c@3100 { - #address-cells = <1>; - #size-cells = <0>; - cell-index = <1>; - compatible = "fsl-i2c"; - reg = <0x3100 0x100>; - interrupts = <0x2b 0x2>; - interrupt-parent = <&mpic>; - dfsrr; - }; - - dma@21300 { - #address-cells = <1>; - #size-cells = <1>; - compatible = "fsl,mpc8548-dma", "fsl,eloplus-dma"; - reg = <0x21300 0x4>; - ranges = <0x0 0x21100 0x200>; - cell-index = <0>; - dma-channel@0 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x0 0x80>; - cell-index = <0>; - interrupt-parent = <&mpic>; - interrupts = <20 2>; - }; - dma-channel@80 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x80 0x80>; - cell-index = <1>; - interrupt-parent = <&mpic>; - interrupts = <21 2>; - }; - dma-channel@100 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x100 0x80>; - cell-index = <2>; - interrupt-parent = <&mpic>; - interrupts = <22 2>; - }; - dma-channel@180 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x180 0x80>; - cell-index = <3>; - interrupt-parent = <&mpic>; - interrupts = <23 2>; - }; - }; - - enet0: ethernet@24000 { - #address-cells = <1>; - #size-cells = <1>; - cell-index = <0>; - device_type = 
"network"; - model = "eTSEC"; - compatible = "gianfar"; - reg = <0x24000 0x1000>; - ranges = <0x0 0x24000 0x1000>; - local-mac-address = [ 00 00 00 00 00 00 ]; - interrupts = <0x1d 0x2 0x1e 0x2 0x22 0x2>; - interrupt-parent = <&mpic>; - tbi-handle = <&tbi0>; - phy-handle = <&phy0>; - - mdio@520 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,gianfar-mdio"; - reg = <0x520 0x20>; - - phy0: ethernet-phy@19 { - interrupt-parent = <&mpic>; - interrupts = <0x6 0x1>; - reg = <0x19>; - }; - phy1: ethernet-phy@1a { - interrupt-parent = <&mpic>; - interrupts = <0x7 0x1>; - reg = <0x1a>; - }; - tbi0: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - }; - - enet1: ethernet@25000 { - #address-cells = <1>; - #size-cells = <1>; - cell-index = <1>; - device_type = "network"; - model = "eTSEC"; - compatible = "gianfar"; - reg = <0x25000 0x1000>; - ranges = <0x0 0x25000 0x1000>; - local-mac-address = [ 00 00 00 00 00 00 ]; - interrupts = <0x23 0x2 0x24 0x2 0x28 0x2>; - interrupt-parent = <&mpic>; - tbi-handle = <&tbi1>; - phy-handle = <&phy1>; - - mdio@520 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,gianfar-tbi"; - reg = <0x520 0x20>; - - tbi1: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - }; - - serial0: serial@4500 { - cell-index = <0>; - device_type = "serial"; - compatible = "fsl,ns16550", "ns16550"; - reg = <0x4500 0x100>; // reg base, size - clock-frequency = <0>; // should we fill in in uboot? - interrupts = <0x2a 0x2>; - interrupt-parent = <&mpic>; - }; - - serial1: serial@4600 { - cell-index = <1>; - device_type = "serial"; - compatible = "fsl,ns16550", "ns16550"; - reg = <0x4600 0x100>; // reg base, size - clock-frequency = <0>; // should we fill in in uboot? - interrupts = <0x2a 0x2>; - interrupt-parent = <&mpic>; - }; - - global-utilities@e0000 { //global utilities reg - compatible = "fsl,mpc8548-guts"; - reg = <0xe0000 0x1000>; - fsl,has-rstcr; - }; - - crypto@30000 { - compatible = "fsl,sec2.1", "fsl,sec2.0"; - reg = <0x30000 0x10000>; - interrupts = <45 2>; - interrupt-parent = <&mpic>; - fsl,num-channels = <4>; - fsl,channel-fifo-len = <24>; - fsl,exec-units-mask = <0xfe>; - fsl,descriptor-types-mask = <0x12b0ebf>; - }; - - mpic: pic@40000 { - interrupt-controller; - #address-cells = <0>; - #interrupt-cells = <2>; - reg = <0x40000 0x40000>; - compatible = "chrp,open-pic"; - device_type = "open-pic"; - }; - }; - - pci0: pci@e0008000 { - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - /* IDSEL 0x01 (PCI-X slot) @66MHz */ - 0x0800 0x0 0x0 0x1 &mpic 0x2 0x1 - 0x0800 0x0 0x0 0x2 &mpic 0x3 0x1 - 0x0800 0x0 0x0 0x3 &mpic 0x4 0x1 - 0x0800 0x0 0x0 0x4 &mpic 0x1 0x1 - - /* IDSEL 0x11 (PCI, 3.3V 32bit) @33MHz */ - 0x8800 0x0 0x0 0x1 &mpic 0x2 0x1 - 0x8800 0x0 0x0 0x2 &mpic 0x3 0x1 - 0x8800 0x0 0x0 0x3 &mpic 0x4 0x1 - 0x8800 0x0 0x0 0x4 &mpic 0x1 0x1>; - - interrupt-parent = <&mpic>; - interrupts = <0x18 0x2>; - bus-range = <0 0>; - ranges = <0x02000000 0x0 0x80000000 0x80000000 0x0 0x10000000 - 0x01000000 0x0 0x00000000 0xe2000000 0x0 0x00800000>; - clock-frequency = <66000000>; - #interrupt-cells = <1>; - #size-cells = <2>; - #address-cells = <3>; - reg = <0xe0008000 0x1000>; - compatible = "fsl,mpc8540-pcix", "fsl,mpc8540-pci"; - device_type = "pci"; - }; - - pci1: pcie@e000a000 { - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - - /* IDSEL 0x0 (PEX) */ - 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 - 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 - 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 - 0x0000 0x0 0x0 0x4 &mpic 0x3 
0x1>; - - interrupt-parent = <&mpic>; - interrupts = <0x1a 0x2>; - bus-range = <0x0 0xff>; - ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x10000000 - 0x01000000 0x0 0x00000000 0xe2800000 0x0 0x08000000>; - clock-frequency = <33000000>; - #interrupt-cells = <1>; - #size-cells = <2>; - #address-cells = <3>; - reg = <0xe000a000 0x1000>; - compatible = "fsl,mpc8548-pcie"; - device_type = "pci"; - pcie@0 { - reg = <0x0 0x0 0x0 0x0 0x0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; - ranges = <0x02000000 0x0 0xa0000000 - 0x02000000 0x0 0xa0000000 - 0x0 0x10000000 - - 0x01000000 0x0 0x00000000 - 0x01000000 0x0 0x00000000 - 0x0 0x00800000>; - }; - }; -}; diff --git a/arch/powerpc/boot/dts/sbc8548-pre.dtsi b/arch/powerpc/boot/dts/sbc8548-pre.dtsi deleted file mode 100644 index 0e3665fd15d0..000000000000 --- a/arch/powerpc/boot/dts/sbc8548-pre.dtsi +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Copyright 2007 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - -/{ - model = "SBC8548"; - compatible = "SBC8548"; - #address-cells = <1>; - #size-cells = <1>; - - aliases { - ethernet0 = &enet0; - ethernet1 = &enet1; - serial0 = &serial0; - serial1 = &serial1; - pci0 = &pci0; - pci1 = &pci1; - }; - - cpus { - #address-cells = <1>; - #size-cells = <0>; - - PowerPC,8548@0 { - device_type = "cpu"; - reg = <0>; - d-cache-line-size = <0x20>; // 32 bytes - i-cache-line-size = <0x20>; // 32 bytes - d-cache-size = <0x8000>; // L1, 32K - i-cache-size = <0x8000>; // L1, 32K - timebase-frequency = <0>; // From uboot - bus-frequency = <0>; - clock-frequency = <0>; - next-level-cache = <&L2>; - }; - }; - - memory { - device_type = "memory"; - reg = <0x00000000 0x10000000>; - }; - -}; diff --git a/arch/powerpc/boot/dts/sbc8548.dts b/arch/powerpc/boot/dts/sbc8548.dts deleted file mode 100644 index ce0a119f496e..000000000000 --- a/arch/powerpc/boot/dts/sbc8548.dts +++ /dev/null @@ -1,106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Copyright 2007 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - - -/dts-v1/; - -/include/ "sbc8548-pre.dtsi" - -/{ - localbus@e0000000 { - #address-cells = <2>; - #size-cells = <1>; - compatible = "simple-bus"; - reg = <0xe0000000 0x5000>; - interrupt-parent = <&mpic>; - - ranges = <0x0 0x0 0xff800000 0x00800000 /*8MB Flash*/ - 0x3 0x0 0xf0000000 0x04000000 /*64MB SDRAM*/ - 0x4 0x0 0xf4000000 0x04000000 /*64MB SDRAM*/ - 0x5 0x0 0xf8000000 0x00b10000 /* EPLD */ - 0x6 0x0 0xec000000 0x04000000>; /*64MB Flash*/ - - - flash@0,0 { - #address-cells = <1>; - #size-cells = <1>; - compatible = "intel,JS28F640", "cfi-flash"; - reg = <0x0 0x0 0x800000>; - bank-width = <1>; - device-width = <1>; - partition@0 { - label = "space"; - /* FF800000 -> FFF9FFFF */ - reg = <0x00000000 0x007a0000>; - }; - partition@7a0000 { - label = "bootloader"; - /* FFFA0000 -> FFFFFFFF */ - reg = <0x007a0000 0x00060000>; - read-only; - }; - }; - - epld@5,0 { - compatible = "wrs,epld-localbus"; - #address-cells = <2>; - #size-cells = <1>; - reg = <0x5 0x0 0x00b10000>; - ranges = < - 0x0 0x0 0x5 0x000000 0x1fff /* LED */ - 0x1 0x0 0x5 0x100000 0x1fff /* Switches */ - 0x3 0x0 0x5 0x300000 0x1fff /* HW Rev. 
*/ - 0xb 0x0 0x5 0xb00000 0x1fff /* EEPROM */ - >; - - led@0,0 { - compatible = "led"; - reg = <0x0 0x0 0x1fff>; - }; - - switches@1,0 { - compatible = "switches"; - reg = <0x1 0x0 0x1fff>; - }; - - hw-rev@3,0 { - compatible = "hw-rev"; - reg = <0x3 0x0 0x1fff>; - }; - - eeprom@b,0 { - compatible = "eeprom"; - reg = <0xb 0 0x1fff>; - }; - - }; - - alt-flash@6,0 { - #address-cells = <1>; - #size-cells = <1>; - reg = <0x6 0x0 0x04000000>; - compatible = "intel,JS28F128", "cfi-flash"; - bank-width = <4>; - device-width = <1>; - partition@0 { - label = "space"; - /* EC000000 -> EFEFFFFF */ - reg = <0x00000000 0x03f00000>; - }; - partition@3f00000 { - label = "bootloader"; - /* EFF00000 -> EFFFFFFF */ - reg = <0x03f00000 0x00100000>; - read-only; - }; - }; - }; -}; - -/include/ "sbc8548-post.dtsi" diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index aaa381da1906..e9c945b123c6 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -216,7 +216,18 @@ control@d800100 { compatible = "nintendo,hollywood-control"; - reg = <0x0d800100 0x300>; + /* + * Both the address and length are wrong, according to + * Wiibrew this should be <0x0d800000 0x400>, but it + * requires refactoring the PIC1, GPIO and OTP nodes + * before changing that. + */ + reg = <0x0d800100 0xa0>; + }; + + otp@d8001ec { + compatible = "nintendo,hollywood-otp"; + reg = <0x0d8001ec 0x8>; }; disk@d806000 { diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh index b6a256bc96ee..14473150ddb4 100644 --- a/arch/powerpc/boot/install.sh +++ b/arch/powerpc/boot/install.sh @@ -15,12 +15,25 @@ # $2 - kernel image file # $3 - kernel map file # $4 - default install path (blank if root directory) -# $5 and more - kernel boot files; zImage*, uImage, cuImage.*, etc. # # Bail with error code if anything goes wrong set -e +verify () { + if [ ! -f "$1" ]; then + echo "" 1>&2 + echo " *** Missing file: $1" 1>&2 + echo ' *** You need to run "make" before "make install".' 
1>&2 + echo "" 1>&2 + exit 1 + fi +} + +# Make sure the files actually exist +verify "$2" +verify "$3" + # User may have a custom install script if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi @@ -41,15 +54,3 @@ fi cat $2 > $4/$image_name cp $3 $4/System.map - -# Copy all the bootable image files -path=$4 -shift 4 -while [ $# -ne 0 ]; do - image_name=`basename $1` - if [ -f $path/$image_name ]; then - mv $path/$image_name $path/$image_name.old - fi - cat $1 > $path/$image_name - shift -done; diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 1f4676bab786..1cd82564c996 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -298,7 +298,7 @@ cuboot*) *-tqm8541|*-mpc8560*|*-tqm8560|*-tqm8555|*-ksi8560*) platformo=$object/cuboot-85xx-cpm2.o ;; - *-mpc85*|*-tqm85*|*-sbc85*) + *-mpc85*|*-tqm85*) platformo=$object/cuboot-85xx.o ;; *-amigaone) diff --git a/arch/powerpc/configs/85xx/sbc8548_defconfig b/arch/powerpc/configs/85xx/sbc8548_defconfig deleted file mode 100644 index 258881727119..000000000000 --- a/arch/powerpc/configs/85xx/sbc8548_defconfig +++ /dev/null @@ -1,50 +0,0 @@ -CONFIG_PPC_85xx=y -CONFIG_SYSVIPC=y -CONFIG_LOG_BUF_SHIFT=14 -CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -CONFIG_SLAB=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_SBC8548=y -CONFIG_GEN_RTC=y -CONFIG_BINFMT_MISC=y -CONFIG_MATH_EMULATION=y -# CONFIG_SECCOMP is not set -CONFIG_PCI=y -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_XFRM_USER=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_IP_PNP_BOOTP=y -CONFIG_SYN_COOKIES=y -# CONFIG_IPV6 is not set -# CONFIG_FW_LOADER is not set -CONFIG_MTD=y -CONFIG_MTD_BLOCK=y -CONFIG_MTD_CFI=y -CONFIG_MTD_CFI_ADV_OPTIONS=y -CONFIG_MTD_CFI_GEOMETRY=y -CONFIG_MTD_CFI_I4=y -CONFIG_MTD_CFI_INTELEXT=y -CONFIG_MTD_PHYSMAP_OF=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_NETDEVICES=y -CONFIG_GIANFAR=y -CONFIG_BROADCOM_PHY=y -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_SERIO is not set -# CONFIG_VT is not set -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -# CONFIG_HW_RANDOM is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -CONFIG_NFS_FS=y -CONFIG_ROOT_NFS=y diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig index a08b739123da..9465209b8c5b 100644 --- a/arch/powerpc/configs/microwatt_defconfig +++ b/arch/powerpc/configs/microwatt_defconfig @@ -5,6 +5,7 @@ CONFIG_PREEMPT_VOLUNTARY=y CONFIG_TICK_CPU_ACCOUNTING=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12 +CONFIG_CGROUPS=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KALLSYMS_ALL=y @@ -53,10 +54,12 @@ CONFIG_MTD_SPI_NOR=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_NETDEVICES=y +CONFIG_LITEX_LITEETH=y # CONFIG_WLAN is not set # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_8250=y # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set CONFIG_SERIAL_8250_CONSOLE=y @@ -76,8 +79,10 @@ CONFIG_SPI_SPIDEV=y CONFIG_EXT4_FS=y # CONFIG_FILE_LOCKING is not set # CONFIG_DNOTIFY is not set -# CONFIG_INOTIFY_USER is not set +CONFIG_AUTOFS_FS=y +CONFIG_TMPFS=y # CONFIG_MISC_FILESYSTEMS is not set +CONFIG_CRYPTO_SHA256=y # CONFIG_CRYPTO_HW is not set # CONFIG_XZ_DEC_X86 is not set # CONFIG_XZ_DEC_IA64 is not set diff --git a/arch/powerpc/configs/mpc85xx_base.config b/arch/powerpc/configs/mpc85xx_base.config index 
b1593fe6f70b..85907b776908 100644 --- a/arch/powerpc/configs/mpc85xx_base.config +++ b/arch/powerpc/configs/mpc85xx_base.config @@ -13,7 +13,6 @@ CONFIG_P1022_DS=y CONFIG_P1022_RDK=y CONFIG_P1023_RDB=y CONFIG_TWR_P102x=y -CONFIG_SBC8548=y CONFIG_SOCRATES=y CONFIG_STX_GP3=y CONFIG_TQM8540=y diff --git a/arch/powerpc/configs/mpc86xx_base.config b/arch/powerpc/configs/mpc86xx_base.config index 67bd1fa036ee..588870e6af3b 100644 --- a/arch/powerpc/configs/mpc86xx_base.config +++ b/arch/powerpc/configs/mpc86xx_base.config @@ -1,6 +1,5 @@ CONFIG_PPC_86xx=y CONFIG_MPC8641_HPCN=y -CONFIG_SBC8641D=y CONFIG_MPC8610_HPCD=y CONFIG_GEF_PPC9A=y CONFIG_GEF_SBC310=y diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index d21f266cea9a..c74dc76b1d0d 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -1,19 +1,30 @@ -CONFIG_PPC_8xx=y # CONFIG_SWAP is not set CONFIG_SYSVIPC=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_JIT=y +CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set # CONFIG_BASE_FULL is not set # CONFIG_FUTEX is not set +CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y +CONFIG_PPC_8xx=y +CONFIG_8xx_GPIO=y +CONFIG_SMC_UCODE_PATCH=y +CONFIG_PIN_TLB=y CONFIG_GEN_RTC=y CONFIG_HZ_100=y +CONFIG_MATH_EMULATION=y +CONFIG_PPC_16K_PAGES=y +CONFIG_ADVANCED_OPTIONS=y # CONFIG_SECCOMP is not set +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_MODULES=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -21,7 +32,6 @@ CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_PNP=y CONFIG_SYN_COOKIES=y -# CONFIG_IPV6 is not set # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_BLOCK=y @@ -34,6 +44,7 @@ CONFIG_MTD_CFI_GEOMETRY=y # CONFIG_MTD_CFI_I2 is not set CONFIG_MTD_CFI_I4=y CONFIG_MTD_CFI_AMDSTD=y +CONFIG_MTD_PHYSMAP=y CONFIG_MTD_PHYSMAP_OF=y # CONFIG_BLK_DEV is not set CONFIG_NETDEVICES=y @@ -46,39 +57,25 @@ CONFIG_DAVICOM_PHY=y # CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y +CONFIG_SPI=y +CONFIG_SPI_FSL_SPI=y # CONFIG_HWMON is not set +CONFIG_WATCHDOG=y +CONFIG_8xxx_WDT=y # CONFIG_USB_SUPPORT is not set # CONFIG_DNOTIFY is not set CONFIG_TMPFS=y CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y +CONFIG_CRYPTO=y +CONFIG_CRYPTO_DEV_TALITOS=y CONFIG_CRC32_SLICEBY4=y CONFIG_DEBUG_INFO=y CONFIG_MAGIC_SYSRQ=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_PPC_16K_PAGES=y -CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_FS=y -CONFIG_PPC_PTDUMP=y -CONFIG_MODULES=y -CONFIG_SPI=y -CONFIG_SPI_FSL_SPI=y -CONFIG_CRYPTO=y -CONFIG_CRYPTO_DEV_TALITOS=y -CONFIG_8xx_GPIO=y -CONFIG_WATCHDOG=y -CONFIG_8xxx_WDT=y -CONFIG_SMC_UCODE_PATCH=y -CONFIG_ADVANCED_OPTIONS=y -CONFIG_PIN_TLB=y -CONFIG_PERF_EVENTS=y -CONFIG_MATH_EMULATION=y -CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_IPV6=y -CONFIG_BPF_JIT=y CONFIG_DEBUG_VM_PGTABLE=y +CONFIG_DETECT_HUNG_TASK=y CONFIG_BDI_SWITCH=y CONFIG_PPC_EARLY_DEBUG=y -CONFIG_PPC_EARLY_DEBUG_CPM_ADDR=0xff002008 +CONFIG_PPC_PTDUMP=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index ee09f7bb6ea9..6697c5e6682f 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -55,7 +55,6 @@ CONFIG_MPC837x_RDB=y CONFIG_ASP834x=y CONFIG_PPC_86xx=y CONFIG_MPC8641_HPCN=y -CONFIG_SBC8641D=y CONFIG_MPC8610_HPCD=y 
CONFIG_GEF_SBC610=y CONFIG_CPU_FREQ=y diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index 379c171f3ddd..a0c45bf2bfb1 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -99,6 +99,7 @@ CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_LEDS_TRIGGER_PANIC=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_GENERIC=y +CONFIG_NVMEM_NINTENDO_OTP=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_FUSE_FS=m diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index 19b70c5b5f18..2b736d9fbb1b 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -17,7 +17,7 @@ #define PPC_LONG stringify_in_c(.8byte) #define PPC_LONG_ALIGN stringify_in_c(.balign 8) #define PPC_TLNEI stringify_in_c(tdnei) -#define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh) +#define PPC_LLARX stringify_in_c(ldarx) #define PPC_STLCX stringify_in_c(stdcx.) #define PPC_CNTLZL stringify_in_c(cntlzd) #define PPC_MTOCRF(FXM, RS) MTOCRF((FXM), RS) @@ -50,7 +50,7 @@ #define PPC_LONG stringify_in_c(.long) #define PPC_LONG_ALIGN stringify_in_c(.balign 4) #define PPC_TLNEI stringify_in_c(twnei) -#define PPC_LLARX(t, a, b, eh) PPC_LWARX(t, a, b, eh) +#define PPC_LLARX stringify_in_c(lwarx) #define PPC_STLCX stringify_in_c(stwcx.) #define PPC_CNTLZL stringify_in_c(cntlzw) #define PPC_MTOCRF stringify_in_c(mtcrf) diff --git a/arch/powerpc/include/asm/asm-const.h b/arch/powerpc/include/asm/asm-const.h index 0ce2368bd20f..dbfa5e1e3198 100644 --- a/arch/powerpc/include/asm/asm-const.h +++ b/arch/powerpc/include/asm/asm-const.h @@ -12,16 +12,6 @@ # define ASM_CONST(x) __ASM_CONST(x) #endif -/* - * Inline assembly memory constraint - * - * GCC 4.9 doesn't properly handle pre update memory constraint "m<>" - * - */ -#if defined(GCC_VERSION) && GCC_VERSION < 50000 -#define UPD_CONSTR "" -#else #define UPD_CONSTR "<>" -#endif #endif /* _ASM_POWERPC_ASM_CONST_H */ diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index a1732a79e92a..6a53ef178bfd 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -207,7 +207,7 @@ arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new) int r, o = *old; __asm__ __volatile__ ( -"1:\t" PPC_LWARX(%0,0,%2,1) " # atomic_try_cmpxchg_acquire \n" +"1: lwarx %0,0,%2,%5 # atomic_try_cmpxchg_acquire \n" " cmpw 0,%0,%3 \n" " bne- 2f \n" " stwcx. %4,0,%2 \n" @@ -215,7 +215,7 @@ arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new) "\t" PPC_ACQUIRE_BARRIER " \n" "2: \n" : "=&r" (r), "+m" (v->counter) - : "r" (&v->counter), "r" (o), "r" (new) + : "r" (&v->counter), "r" (o), "r" (new), "i" (IS_ENABLED(CONFIG_PPC64) ? 
1 : 0) : "cr0", "memory"); if (unlikely(r != o)) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 299ab33505a6..11847b6a244e 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -70,7 +70,7 @@ static inline void fn(unsigned long mask, \ unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ prefix \ -"1:" PPC_LLARX(%0,0,%3,0) "\n" \ +"1:" PPC_LLARX "%0,0,%3,0\n" \ stringify_in_c(op) "%0,%0,%2\n" \ PPC_STLCX "%0,0,%3\n" \ "bne- 1b\n" \ @@ -115,13 +115,13 @@ static inline unsigned long fn( \ unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ prefix \ -"1:" PPC_LLARX(%0,0,%3,eh) "\n" \ +"1:" PPC_LLARX "%0,0,%3,%4\n" \ stringify_in_c(op) "%1,%0,%2\n" \ PPC_STLCX "%1,0,%3\n" \ "bne- 1b\n" \ postfix \ : "=&r" (old), "=&r" (t) \ - : "r" (mask), "r" (p) \ + : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ : "cc", "memory"); \ return (old & mask); \ } @@ -170,7 +170,7 @@ clear_bit_unlock_return_word(int nr, volatile unsigned long *addr) __asm__ __volatile__ ( PPC_RELEASE_BARRIER -"1:" PPC_LLARX(%0,0,%3,0) "\n" +"1:" PPC_LLARX "%0,0,%3,0\n" "andc %1,%0,%2\n" PPC_STLCX "%1,0,%3\n" "bne- 1b\n" diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index a1cc73a88710..170339969b7c 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -90,7 +90,7 @@ /* Prevent access to userspace using any key values */ LOAD_REG_IMMEDIATE(\gpr2, AMR_KUAP_BLOCKED) 999: tdne \gpr1, \gpr2 - EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) + EMIT_WARN_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_BOOK3S_KUAP, 67) #endif .endm diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 0b2162890d8b..02c08d1492f8 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -4,6 +4,7 @@ #ifdef __KERNEL__ #include <asm/asm-compat.h> +#include <asm/extable.h> #ifdef CONFIG_BUG @@ -30,6 +31,11 @@ .endm #endif /* verbose */ +.macro EMIT_WARN_ENTRY addr,file,line,flags + EX_TABLE(\addr,\addr+4) + EMIT_BUG_ENTRY \addr,\file,\line,\flags +.endm + #else /* !__ASSEMBLY__ */ /* _EMIT_BUG_ENTRY expects args %0,%1,%2,%3 to be FILE, LINE, flags and sizeof(struct bug_entry), respectively */ @@ -58,6 +64,16 @@ "i" (sizeof(struct bug_entry)), \ ##__VA_ARGS__) +#define WARN_ENTRY(insn, flags, label, ...) \ + asm_volatile_goto( \ + "1: " insn "\n" \ + EX_TABLE(1b, %l[label]) \ + _EMIT_BUG_ENTRY \ + : : "i" (__FILE__), "i" (__LINE__), \ + "i" (flags), \ + "i" (sizeof(struct bug_entry)), \ + ##__VA_ARGS__ : : label) + /* * BUG_ON() and WARN_ON() do their best to cooperate with compile-time * optimisations. 
However depending on the complexity of the condition @@ -68,7 +84,19 @@ BUG_ENTRY("twi 31, 0, 0", 0); \ unreachable(); \ } while (0) +#define HAVE_ARCH_BUG +#define __WARN_FLAGS(flags) do { \ + __label__ __label_warn_on; \ + \ + WARN_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags), __label_warn_on); \ + unreachable(); \ + \ +__label_warn_on: \ + break; \ +} while (0) + +#ifdef CONFIG_PPC64 #define BUG_ON(x) do { \ if (__builtin_constant_p(x)) { \ if (x) \ @@ -78,31 +106,43 @@ } \ } while (0) -#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags)) - #define WARN_ON(x) ({ \ - int __ret_warn_on = !!(x); \ - if (__builtin_constant_p(__ret_warn_on)) { \ - if (__ret_warn_on) \ + bool __ret_warn_on = false; \ + do { \ + if (__builtin_constant_p((x))) { \ + if (!(x)) \ + break; \ __WARN(); \ - } else { \ - BUG_ENTRY(PPC_TLNEI " %4, 0", \ - BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ - "r" (__ret_warn_on)); \ - } \ + __ret_warn_on = true; \ + } else { \ + __label__ __label_warn_on; \ + \ + WARN_ENTRY(PPC_TLNEI " %4, 0", \ + BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ + __label_warn_on, \ + "r" ((__force long)(x))); \ + break; \ +__label_warn_on: \ + __ret_warn_on = true; \ + } \ + } while (0); \ unlikely(__ret_warn_on); \ }) -#define HAVE_ARCH_BUG #define HAVE_ARCH_BUG_ON #define HAVE_ARCH_WARN_ON +#endif + #endif /* __ASSEMBLY __ */ #else #ifdef __ASSEMBLY__ .macro EMIT_BUG_ENTRY addr,file,line,flags .endm +.macro EMIT_WARN_ENTRY addr,file,line,flags +.endm #else /* !__ASSEMBLY__ */ #define _EMIT_BUG_ENTRY +#define _EMIT_WARN_ENTRY #endif #endif /* CONFIG_BUG */ diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index e33dcf134cdd..7afc96fb6524 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -83,22 +83,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = current->thread.regs; - unsigned long usp = regs->gpr[1]; - - /* - * We can't access below the stack pointer in the 32bit ABI and - * can access 288 bytes in the 64bit big-endian ABI, - * or 512 bytes with the new ELFv2 little-endian ABI. - */ - if (!is_32bit_task()) - usp -= USER_REDZONE_SIZE; - - return (void __user *) (usp - len); -} - /* * ipc64_perm is actually 32/64bit clean but since the compat layer refers to * it we may as well define it. diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h deleted file mode 100644 index 2c5c48571d75..000000000000 --- a/arch/powerpc/include/asm/debugfs.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_POWERPC_DEBUGFS_H -#define _ASM_POWERPC_DEBUGFS_H - -/* - * Copyright 2017, Michael Ellerman, IBM Corporation. 
- */ - -#include <linux/debugfs.h> - -extern struct dentry *powerpc_debugfs_root; - -#endif /* _ASM_POWERPC_DEBUGFS_H */ diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index bf2402fed3e0..4265d5e95c2c 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -111,6 +111,7 @@ int drmem_update_dt(void); int __init walk_drmem_lmbs_early(unsigned long node, void *data, int (*func)(struct drmem_lmb *, const __be32 **, void *)); +void drmem_update_lmbs(struct property *prop); #endif static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) diff --git a/arch/powerpc/include/asm/extable.h b/arch/powerpc/include/asm/extable.h index eb91b2d2935a..26ce2e5c0fa8 100644 --- a/arch/powerpc/include/asm/extable.h +++ b/arch/powerpc/include/asm/extable.h @@ -17,6 +17,8 @@ #define ARCH_HAS_RELATIVE_EXTABLE +#ifndef __ASSEMBLY__ + struct exception_table_entry { int insn; int fixup; @@ -28,3 +30,15 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x) } #endif + +/* + * Helper macro for exception table entries + */ +#define EX_TABLE(_fault, _target) \ + stringify_in_c(.section __ex_table,"a";)\ + stringify_in_c(.balign 4;) \ + stringify_in_c(.long (_fault) - . ;) \ + stringify_in_c(.long (_target) - . ;) \ + stringify_in_c(.previous) + +#endif diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 7604673787d6..97a3bd9ffeb9 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -44,7 +44,7 @@ #define FW_FEATURE_OPAL ASM_CONST(0x0000000010000000) #define FW_FEATURE_SET_MODE ASM_CONST(0x0000000040000000) #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x0000000080000000) -#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000) +#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000) #define FW_FEATURE_PRRN ASM_CONST(0x0000000200000000) #define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000) #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) @@ -53,6 +53,7 @@ #define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000) #define FW_FEATURE_STUFF_TCE ASM_CONST(0x0000008000000000) #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000) +#define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000) #ifndef __ASSEMBLY__ @@ -69,11 +70,11 @@ enum { FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO | FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | - FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | + FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN | FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | - FW_FEATURE_RPT_INVALIDATE, + FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index deef7c94d7b6..bf3b84128525 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl); */ extern struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, unsigned long res_start, unsigned long res_end); +bool iommu_table_in_use(struct iommu_table *tbl); #define IOMMU_TABLE_GROUP_MAX_TABLES 2 diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h index eaf3a562bf1e..19b6942c6969 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -39,6 +39,7 @@ struct kvm_nested_guest { pgd_t *shadow_pgtable; /* our page table for this guest */ u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */ u64 process_table; /* process table entry for this guest */ + u64 hfscr; /* HFSCR that the L1 requested for this nested guest */ long refcnt; /* number of pointers to this struct */ struct mutex tlb_lock; /* serialize page faults and tlbies */ struct kvm_nested_guest *next; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9f52f282b1aa..080a7feb7731 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -103,7 +103,6 @@ struct kvm_vcpu_stat { u64 emulated_inst_exits; u64 dec_exits; u64 ext_intr_exits; - u64 halt_wait_ns; u64 halt_successful_wait; u64 dbell_exits; u64 gdbell_exits; @@ -811,6 +810,8 @@ struct kvm_vcpu_arch { u32 online; + u64 hfscr_permitted; /* A mask of permitted HFSCR facilities */ + /* For support of nested guests */ struct kvm_nested_guest *nested; u32 nested_vcpu_id; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2d88944f9f34..671fbd1a765e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -664,9 +664,9 @@ extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu); extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc); + unsigned long host_irq); extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc); + unsigned long host_irq); extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h index 6e20bb5c74ea..de7f79157918 100644 --- a/arch/powerpc/include/asm/membarrier.h +++ b/arch/powerpc/include/asm/membarrier.h @@ -12,7 +12,8 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, * when switching from userspace to kernel is not needed after * store to rq->curr. 
*/ - if (likely(!(atomic_read(&next->membarrier_state) & + if (IS_ENABLED(CONFIG_SMP) && + likely(!(atomic_read(&next->membarrier_state) & (MEMBARRIER_STATE_PRIVATE_EXPEDITED | MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev)) return; diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 27016b98ecb2..8abe8e42e045 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -324,7 +324,7 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) } #endif /* !CONFIG_DEBUG_VM */ -static inline bool radix_enabled(void) +static __always_inline bool radix_enabled(void) { return mmu_has_feature(MMU_FTR_TYPE_RADIX); } diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 74424c14515c..90f488fa4c17 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -126,6 +126,11 @@ struct pci_controller { #endif /* CONFIG_PPC64 */ void *private_data; + + /* IRQ domain hierarchy */ + struct irq_domain *dev_domain; + struct irq_domain *msi_domain; + struct fwnode_handle *fwnode; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h index c6bbe9778d3c..3c09109e708e 100644 --- a/arch/powerpc/include/asm/pmc.h +++ b/arch/powerpc/include/asm/pmc.h @@ -34,6 +34,13 @@ static inline void ppc_set_pmu_inuse(int inuse) #endif } +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static inline int ppc_get_pmu_inuse(void) +{ + return get_paca()->pmcregs_in_use; +} +#endif + extern void power4_enable_pmcs(void); #else /* CONFIG_PPC64 */ diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index d0ee0ede5767..b3f480799352 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -33,7 +33,7 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num); void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num); int pnv_cxl_get_irq_count(struct pci_dev *dev); struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev); -int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq); +int64_t pnv_opal_pci_msi_eoi(struct irq_data *d); bool is_pnv_opal_msi(struct irq_chip *chip); #ifdef CONFIG_CXL_BASE diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index bede76dd3db7..baea657bc868 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -576,8 +576,6 @@ #define PPC_DIVDE(t, a, b) stringify_in_c(.long PPC_RAW_DIVDE(t, a, b)) #define PPC_DIVDEU(t, a, b) stringify_in_c(.long PPC_RAW_DIVDEU(t, a, b)) #define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LQARX(t, a, b, eh)) -#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LDARX(t, a, b, eh)) -#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LWARX(t, a, b, eh)) #define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_RAW_STQCX(t, a, b)) #define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_RAW_MADDHD(t, a, b, c)) #define PPC_MADDHDU(t, a, b, c) stringify_in_c(.long PPC_RAW_MADDHDU(t, a, b, c)) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 116c1519728a..1c538a9a11e0 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -10,6 +10,7 @@ #include <asm/ppc-opcode.h> #include <asm/firmware.h> #include <asm/feature-fixups.h> +#include <asm/extable.h> #ifdef __ASSEMBLY__ @@ -259,7 +260,7 @@ 
n: /* Be careful, this will clobber the lr register. */ #define LOAD_REG_ADDR_PIC(reg, name) \ - bl 0f; \ + bcl 20,31,$+4; \ 0: mflr reg; \ addis reg,reg,(name - 0b)@ha; \ addi reg,reg,(name - 0b)@l; @@ -752,16 +753,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #endif /* __ASSEMBLY__ */ -/* - * Helper macro for exception table entries - */ -#define EX_TABLE(_fault, _target) \ - stringify_in_c(.section __ex_table,"a";)\ - stringify_in_c(.balign 4;) \ - stringify_in_c(.long (_fault) - . ;) \ - stringify_in_c(.long (_target) - . ;) \ - stringify_in_c(.previous) - #define SOFT_MASK_TABLE(_start, _end) \ stringify_in_c(.section __soft_mask_table,"a";)\ stringify_in_c(.balign 8;) \ diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 324a13351749..5c80152e8f18 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -147,8 +147,9 @@ extern int of_read_drc_info_cell(struct property **prop, #define OV5_MSI 0x0201 /* PCIe/MSI support */ #define OV5_CMO 0x0480 /* Cooperative Memory Overcommitment */ #define OV5_XCMO 0x0440 /* Page Coalescing */ -#define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */ +#define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ +#define OV5_FORM2_AFFINITY 0x0520 /* Form2 NUMA affinity */ #define OV5_HP_EVT 0x0604 /* Hot Plug Event support */ #define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ #define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 14422e851494..6e560f035614 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -22,6 +22,7 @@ #include <linux/err.h> #include <uapi/asm/ptrace.h> #include <asm/asm-const.h> +#include <asm/reg.h> #ifndef __ASSEMBLY__ struct pt_regs @@ -43,8 +44,14 @@ struct pt_regs unsigned long mq; #endif unsigned long trap; - unsigned long dar; - unsigned long dsisr; + union { + unsigned long dar; + unsigned long dear; + }; + union { + unsigned long dsisr; + unsigned long esr; + }; unsigned long result; }; }; @@ -197,11 +204,7 @@ static inline unsigned long frame_pointer(struct pt_regs *regs) return 0; } -#ifdef __powerpc64__ -#define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1) -#else #define user_mode(regs) (((regs)->msr & MSR_PR) != 0) -#endif #define force_successful_syscall_return() \ do { \ @@ -286,6 +289,28 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) regs->gpr[3] = rc; } +static inline bool cpu_has_msr_ri(void) +{ + return !IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x); +} + +static inline bool regs_is_unrecoverable(struct pt_regs *regs) +{ + return unlikely(cpu_has_msr_ri() && !(regs->msr & MSR_RI)); +} + +static inline void regs_set_recoverable(struct pt_regs *regs) +{ + if (cpu_has_msr_ri()) + regs_set_return_msr(regs, regs->msr | MSR_RI); +} + +static inline void regs_set_unrecoverable(struct pt_regs *regs) +{ + if (cpu_has_msr_ri()) + regs_set_return_msr(regs, regs->msr & ~MSR_RI); +} + #define arch_has_single_step() (1) #define arch_has_block_step() (true) #define ARCH_HAS_USER_SINGLE_STEP_REPORT diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index be85cf156a1f..e9d27265253b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -415,6 +415,7 @@ #define FSCR_TAR __MASK(FSCR_TAR_LG) #define FSCR_EBB __MASK(FSCR_EBB_LG) 
#define FSCR_DSCR __MASK(FSCR_DSCR_LG) +#define FSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ #define SPRN_HFSCR 0xbe /* HV=1 Facility Status & Control Register */ #define HFSCR_PREFIX __MASK(FSCR_PREFIX_LG) #define HFSCR_MSGP __MASK(FSCR_MSGP_LG) @@ -426,7 +427,7 @@ #define HFSCR_DSCR __MASK(FSCR_DSCR_LG) #define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG) #define HFSCR_FP __MASK(FSCR_FP_LG) -#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ +#define HFSCR_INTR_CAUSE FSCR_INTR_CAUSE #define SPRN_TAR 0x32f /* Target Address Register */ #define SPRN_LPCR 0x13E /* LPAR Control Register */ #define LPCR_VPM0 ASM_CONST(0x8000000000000000) diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 324d7b298ec3..6e4af4492a14 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -38,14 +38,6 @@ extern char start_virt_trampolines[]; extern char end_virt_trampolines[]; #endif -static inline int in_kernel_text(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end) - return 1; - - return 0; -} - static inline unsigned long kernel_toc_addr(void) { /* Defined by the linker, see vmlinux.lds.S */ diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h index 552f325412cc..8985791a2ba5 100644 --- a/arch/powerpc/include/asm/simple_spinlock.h +++ b/arch/powerpc/include/asm/simple_spinlock.h @@ -51,7 +51,7 @@ static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) token = LOCK_TOKEN; __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%2,1) "\n\ +"1: lwarx %0,0,%2,1\n\ cmpwi 0,%0,0\n\ bne- 2f\n\ stwcx. %1,0,%2\n\ @@ -179,7 +179,7 @@ static inline long __arch_read_trylock(arch_rwlock_t *rw) long tmp; __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%1,1) "\n" +"1: lwarx %0,0,%1,1\n" __DO_SIGN_EXTEND " addic. %0,%0,1\n\ ble- 2f\n" @@ -203,7 +203,7 @@ static inline long __arch_write_trylock(arch_rwlock_t *rw) token = WRLOCK_TOKEN; __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%2,1) "\n\ +"1: lwarx %0,0,%2,1\n\ cmpwi 0,%0,0\n\ bne- 2f\n" " stwcx. 
%1,0,%2\n\ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 03b3d010cbab..7ef1cd8168a0 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -33,6 +33,10 @@ extern bool coregroup_enabled; extern int cpu_to_chip_id(int cpu); extern int *chip_id_lookup_table; +DECLARE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); +DECLARE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); +DECLARE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map); + #ifdef CONFIG_SMP struct smp_ops_t { @@ -141,6 +145,7 @@ extern int cpu_to_core_id(int cpu); extern bool has_big_cores; extern bool thread_group_shares_l2; +extern bool thread_group_shares_l3; #define cpu_smt_mask cpu_smt_mask #ifdef CONFIG_SCHED_SMT @@ -195,6 +200,7 @@ extern void __cpu_die(unsigned int cpu); #define hard_smp_processor_id() get_hard_smp_processor_id(0) #define smp_setup_cpu_maps() #define thread_group_shares_l2 0 +#define thread_group_shares_l3 0 static inline void inhibit_secondary_onlining(void) {} static inline void uninhibit_secondary_onlining(void) {} static inline const struct cpumask *cpu_sibling_mask(int cpu) diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index ba0f88f3a30d..c60ebd04b2ed 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -90,10 +90,9 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long val, mask = -1UL; unsigned int n = 6; -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_32BIT)) + if (is_32bit_task()) mask = 0xffffffff; -#endif + while (n--) { if (n == 0) val = regs->orig_gpr3; @@ -116,16 +115,11 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(struct task_struct *task) { - int arch; - - if (IS_ENABLED(CONFIG_PPC64) && !test_tsk_thread_flag(task, TIF_32BIT)) - arch = AUDIT_ARCH_PPC64; + if (is_32bit_task()) + return AUDIT_ARCH_PPC; + else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + return AUDIT_ARCH_PPC64LE; else - arch = AUDIT_ARCH_PPC; - -#ifdef __LITTLE_ENDIAN__ - arch |= __AUDIT_ARCH_LE; -#endif - return arch; + return AUDIT_ARCH_PPC64; } #endif /* _ASM_SYSCALL_H */ diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 398171fdcd9f..7ee66ae5444d 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -6,6 +6,7 @@ #include <linux/compiler.h> #include <linux/linkage.h> #include <linux/types.h> +#include <linux/compat.h> struct rtas_args; @@ -18,5 +19,34 @@ asmlinkage long sys_mmap2(unsigned long addr, size_t len, asmlinkage long ppc64_personality(unsigned long personality); asmlinkage long sys_rtas(struct rtas_args __user *uargs); +#ifdef CONFIG_COMPAT +unsigned long compat_sys_mmap2(unsigned long addr, size_t len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); + +compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); + +compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); + +compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u32 count); + +int compat_sys_truncate64(const char __user *path, u32 reg4, + unsigned long len1, unsigned long len2); + +long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2, u32 len1, u32 len2); + +int compat_sys_ftruncate64(unsigned int fd, u32 reg4, 
unsigned long len1, + unsigned long len2); + +long ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, + size_t len, int advice); + +long compat_sys_sync_file_range2(int fd, unsigned int flags, + unsigned int offset1, unsigned int offset2, + unsigned int nbytes1, unsigned int nbytes2); +#endif + #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_SYSCALLS_H */ diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index db5fc2f2262d..0c34d2756d92 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -19,15 +19,7 @@ #define TCE_VB 0 #define TCE_PCI 1 -/* TCE page size is 4096 bytes (1 << 12) */ - -#define TCE_SHIFT 12 -#define TCE_PAGE_SIZE (1 << TCE_SHIFT) - #define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */ - -#define TCE_RPN_MASK 0xfffffffffful /* 40-bit RPN (4K pages) */ -#define TCE_RPN_SHIFT 12 #define TCE_VALID 0x800 /* TCE valid */ #define TCE_ALLIO 0x400 /* TCE valid for all lpars */ #define TCE_PCI_WRITE 0x2 /* write from PCI allowed */ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index e4db64c0e184..36fcafb1fd6d 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) -extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) @@ -64,6 +64,12 @@ static inline int early_cpu_to_node(int cpu) } int of_drconf_to_nid_single(struct drmem_lmb *lmb); +void update_numa_distance(struct device_node *node); + +extern void map_cpu_to_node(int cpu, int node); +#ifdef CONFIG_HOTPLUG_CPU +extern void unmap_cpu_from_node(unsigned long cpu); +#endif /* CONFIG_HOTPLUG_CPU */ #else @@ -83,7 +89,7 @@ static inline void sysfs_remove_device_from_node(struct device *dev, static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {} -static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { return 0; } @@ -93,6 +99,15 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) return first_online_node; } +static inline void update_numa_distance(struct device_node *node) {} + +#ifdef CONFIG_SMP +static inline void map_cpu_to_node(int cpu, int node) {} +#ifdef CONFIG_HOTPLUG_CPU +static inline void unmap_cpu_from_node(unsigned long cpu) {} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ + #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b541c690a31c..5eb462af6766 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -9,8 +9,6 @@ #define NR_syscalls __NR_syscalls -#define __NR__exit __NR_exit - #ifndef __ASSEMBLY__ #include <linux/types.h> diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h index e072577bc7c0..8d79f994b4aa 100644 --- a/arch/powerpc/include/asm/vdso/processor.h +++ b/arch/powerpc/include/asm/vdso/processor.h @@ -5,12 +5,21 @@ #ifndef __ASSEMBLY__ /* Macros for adjusting thread priority (hardware multi-threading) */ +#ifdef CONFIG_PPC64 #define HMT_very_low() asm volatile("or 31, 31, 31 # very low priority") #define HMT_low() asm volatile("or 1, 1, 1 
# low priority") #define HMT_medium_low() asm volatile("or 6, 6, 6 # medium low priority") #define HMT_medium() asm volatile("or 2, 2, 2 # medium priority") #define HMT_medium_high() asm volatile("or 5, 5, 5 # medium high priority") #define HMT_high() asm volatile("or 3, 3, 3 # high priority") +#else +#define HMT_very_low() +#define HMT_low() +#define HMT_medium_low() +#define HMT_medium() +#define HMT_medium_high() +#define HMT_high() +#endif #ifdef CONFIG_PPC64 #define cpu_relax() do { HMT_low(); HMT_medium(); barrier(); } while (0) diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index d9cf192368ad..0ac9bfddf704 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -89,10 +89,11 @@ static inline int ics_opal_init(void) { return -ENODEV; } /* ICS instance, hooked up to chip_data of an irq */ struct ics { struct list_head link; - int (*map)(struct ics *ics, unsigned int virq); + int (*check)(struct ics *ics, unsigned int hwirq); void (*mask_unknown)(struct ics *ics, unsigned long vec); long (*get_server)(struct ics *ics, unsigned long vec); int (*host_match)(struct ics *ics, struct device_node *node); + struct irq_chip *chip; char data[]; }; diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h index 8b211faa0e42..cf8bb6ac4463 100644 --- a/arch/powerpc/include/asm/xive-regs.h +++ b/arch/powerpc/include/asm/xive-regs.h @@ -80,10 +80,13 @@ #define TM_QW0W2_VU PPC_BIT32(0) #define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ? #define TM_QW1W2_VO PPC_BIT32(0) +#define TM_QW1W2_HO PPC_BIT32(1) /* P10 XIVE2 */ #define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31) #define TM_QW2W2_VP PPC_BIT32(0) +#define TM_QW2W2_HP PPC_BIT32(1) /* P10 XIVE2 */ #define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31) #define TM_QW3W2_VT PPC_BIT32(0) +#define TM_QW3W2_HT PPC_BIT32(1) /* P10 XIVE2 */ #define TM_QW3W2_LP PPC_BIT32(6) #define TM_QW3W2_LE PPC_BIT32(7) #define TM_QW3W2_T PPC_BIT32(31) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index aa094a8655b0..92930b0b5d0e 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -111,6 +111,7 @@ void xive_native_free_vp_block(u32 vp_base); int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data); void xive_cleanup_irq_data(struct xive_irq_data *xd); +void xive_irq_free_data(unsigned int virq); void xive_native_free_irq(u32 irq); int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); @@ -125,6 +126,7 @@ int xive_native_enable_vp(u32 vp_id, bool single_escalation); int xive_native_disable_vp(u32 vp_id); int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); bool xive_native_has_single_escalation(void); +bool xive_native_has_save_restore(void); int xive_native_get_queue_info(u32 vp_id, uint32_t prio, u64 *out_qpage, diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index f66b63e81c3b..7be36c1e1db6 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -46,7 +46,8 @@ obj-y := cputable.o syscalls.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o firmware.o \ - hw_breakpoint_constraints.o interrupt.o + hw_breakpoint_constraints.o interrupt.o \ + kdebugfs.o obj-y += ptrace/ obj-$(CONFIG_PPC64) += setup_64.o \ paca.o nvram_64.o note.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 5bee245d832b..e563d3222d69 100644 --- 
a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -286,23 +286,16 @@ int main(void) STACK_PT_REGS_OFFSET(_CCR, ccr); STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); + STACK_PT_REGS_OFFSET(_DEAR, dear); STACK_PT_REGS_OFFSET(_DSISR, dsisr); + STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); -#ifndef CONFIG_PPC64 - /* - * The PowerPC 400-class & Book-E processors have neither the DAR - * nor the DSISR SPRs. Hence, we overload them to hold the similar - * DEAR and ESR SPRs for such processors. For critical interrupts - * we use them to hold SRR0 and SRR1. - */ - STACK_PT_REGS_OFFSET(_DEAR, dar); - STACK_PT_REGS_OFFSET(_ESR, dsisr); -#else /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC64 STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); -#endif /* CONFIG_PPC64 */ +#endif #ifdef CONFIG_PPC_PKEY STACK_PT_REGS_OFFSET(STACK_REGS_AMR, amr); diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 6f903e9aa20b..cf1be75b7833 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -120,6 +120,7 @@ struct cache { struct cpumask shared_cpu_map; /* online CPUs using this cache */ int type; /* split cache disambiguation */ int level; /* level not explicit in device tree */ + int group_id; /* id of the group of threads that share this cache */ struct list_head list; /* global list of cache objects */ struct cache *next_local; /* next cache of >= level */ }; @@ -142,22 +143,24 @@ static const char *cache_type_string(const struct cache *cache) } static void cache_init(struct cache *cache, int type, int level, - struct device_node *ofnode) + struct device_node *ofnode, int group_id) { cache->type = type; cache->level = level; cache->ofnode = of_node_get(ofnode); + cache->group_id = group_id; INIT_LIST_HEAD(&cache->list); list_add(&cache->list, &cache_list); } -static struct cache *new_cache(int type, int level, struct device_node *ofnode) +static struct cache *new_cache(int type, int level, + struct device_node *ofnode, int group_id) { struct cache *cache; cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache) - cache_init(cache, type, level, ofnode); + cache_init(cache, type, level, ofnode, group_id); return cache; } @@ -309,20 +312,24 @@ static struct cache *cache_find_first_sibling(struct cache *cache) return cache; list_for_each_entry(iter, &cache_list, list) - if (iter->ofnode == cache->ofnode && iter->next_local == cache) + if (iter->ofnode == cache->ofnode && + iter->group_id == cache->group_id && + iter->next_local == cache) return iter; return cache; } -/* return the first cache on a local list matching node */ -static struct cache *cache_lookup_by_node(const struct device_node *node) +/* return the first cache on a local list matching node and thread-group id */ +static struct cache *cache_lookup_by_node_group(const struct device_node *node, + int group_id) { struct cache *cache = NULL; struct cache *iter; list_for_each_entry(iter, &cache_list, list) { - if (iter->ofnode != node) + if (iter->ofnode != node || + iter->group_id != group_id) continue; cache = cache_find_first_sibling(iter); break; @@ -352,14 +359,15 @@ static int cache_is_unified_d(const struct device_node *np) CACHE_TYPE_UNIFIED_D : CACHE_TYPE_UNIFIED; } -static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level) +static struct cache *cache_do_one_devnode_unified(struct device_node *node, int 
group_id, + int level) { pr_debug("creating L%d ucache for %pOFP\n", level, node); - return new_cache(cache_is_unified_d(node), level, node); + return new_cache(cache_is_unified_d(node), level, node, group_id); } -static struct cache *cache_do_one_devnode_split(struct device_node *node, +static struct cache *cache_do_one_devnode_split(struct device_node *node, int group_id, int level) { struct cache *dcache, *icache; @@ -367,8 +375,8 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node, pr_debug("creating L%d dcache and icache for %pOFP\n", level, node); - dcache = new_cache(CACHE_TYPE_DATA, level, node); - icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node); + dcache = new_cache(CACHE_TYPE_DATA, level, node, group_id); + icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node, group_id); if (!dcache || !icache) goto err; @@ -382,31 +390,32 @@ err: return NULL; } -static struct cache *cache_do_one_devnode(struct device_node *node, int level) +static struct cache *cache_do_one_devnode(struct device_node *node, int group_id, int level) { struct cache *cache; if (cache_node_is_unified(node)) - cache = cache_do_one_devnode_unified(node, level); + cache = cache_do_one_devnode_unified(node, group_id, level); else - cache = cache_do_one_devnode_split(node, level); + cache = cache_do_one_devnode_split(node, group_id, level); return cache; } static struct cache *cache_lookup_or_instantiate(struct device_node *node, + int group_id, int level) { struct cache *cache; - cache = cache_lookup_by_node(node); + cache = cache_lookup_by_node_group(node, group_id); WARN_ONCE(cache && cache->level != level, "cache level mismatch on lookup (got %d, expected %d)\n", cache->level, level); if (!cache) - cache = cache_do_one_devnode(node, level); + cache = cache_do_one_devnode(node, group_id, level); return cache; } @@ -443,7 +452,30 @@ static void do_subsidiary_caches_debugcheck(struct cache *cache) of_node_get_device_type(cache->ofnode)); } -static void do_subsidiary_caches(struct cache *cache) +/* + * If sub-groups of threads in a core containing @cpu_id share the + * L@level-cache (information obtained via "ibm,thread-groups" + * device-tree property), then we identify the group by the first + * thread-sibling in the group. We define this to be the group-id. + * + * In the absence of any thread-group information for L@level-cache, + * this function returns -1. 
+ */ +static int get_group_id(unsigned int cpu_id, int level) +{ + if (has_big_cores && level == 1) + return cpumask_first(per_cpu(thread_group_l1_cache_map, + cpu_id)); + else if (thread_group_shares_l2 && level == 2) + return cpumask_first(per_cpu(thread_group_l2_cache_map, + cpu_id)); + else if (thread_group_shares_l3 && level == 3) + return cpumask_first(per_cpu(thread_group_l3_cache_map, + cpu_id)); + return -1; +} + +static void do_subsidiary_caches(struct cache *cache, unsigned int cpu_id) { struct device_node *subcache_node; int level = cache->level; @@ -452,9 +484,11 @@ static void do_subsidiary_caches(struct cache *cache) while ((subcache_node = of_find_next_cache_node(cache->ofnode))) { struct cache *subcache; + int group_id; level++; - subcache = cache_lookup_or_instantiate(subcache_node, level); + group_id = get_group_id(cpu_id, level); + subcache = cache_lookup_or_instantiate(subcache_node, group_id, level); of_node_put(subcache_node); if (!subcache) break; @@ -468,6 +502,7 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cpu_cache = NULL; + int group_id; pr_debug("creating cache object(s) for CPU %i\n", cpu_id); @@ -476,11 +511,13 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id) if (!cpu_node) goto out; - cpu_cache = cache_lookup_or_instantiate(cpu_node, 1); + group_id = get_group_id(cpu_id, 1); + + cpu_cache = cache_lookup_or_instantiate(cpu_node, group_id, 1); if (!cpu_cache) goto out; - do_subsidiary_caches(cpu_cache); + do_subsidiary_caches(cpu_cache, cpu_id); cache_cpu_set(cpu_cache, cpu_id); out: @@ -641,45 +678,6 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char * static struct kobj_attribute cache_level_attr = __ATTR(level, 0444, level_show, NULL); -static unsigned int index_dir_to_cpu(struct cache_index_dir *index) -{ - struct kobject *index_dir_kobj = &index->kobj; - struct kobject *cache_dir_kobj = index_dir_kobj->parent; - struct kobject *cpu_dev_kobj = cache_dir_kobj->parent; - struct device *dev = kobj_to_dev(cpu_dev_kobj); - - return dev->id; -} - -/* - * On big-core systems, each core has two groups of CPUs each of which - * has its own L1-cache. The thread-siblings which share l1-cache with - * @cpu can be obtained via cpu_smallcore_mask(). - * - * On some big-core systems, the L2 cache is shared only between some - * groups of siblings. This is already parsed and encoded in - * cpu_l2_cache_mask(). - * - * TODO: cache_lookup_or_instantiate() needs to be made aware of the - * "ibm,thread-groups" property so that cache->shared_cpu_map - * reflects the correct siblings on platforms that have this - * device-tree property. This helper function is only a stop-gap - * solution so that we report the correct siblings to the - * userspace via sysfs. 
- */ -static const struct cpumask *get_shared_cpu_map(struct cache_index_dir *index, struct cache *cache) -{ - if (has_big_cores) { - int cpu = index_dir_to_cpu(index); - if (cache->level == 1) - return cpu_smallcore_mask(cpu); - if (cache->level == 2 && thread_group_shares_l2) - return cpu_l2_cache_mask(cpu); - } - - return &cache->shared_cpu_map; -} - static ssize_t show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bool list) { @@ -690,7 +688,7 @@ show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bo index = kobj_to_cache_index_dir(k); cache = index->cache; - mask = get_shared_cpu_map(index, cache); + mask = &cache->shared_cpu_map; return cpumap_print_to_pagebuf(list, buf, mask); } @@ -848,13 +846,15 @@ static struct cache *cache_lookup_by_cpu(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cache; + int group_id; cpu_node = of_get_cpu_node(cpu_id, NULL); WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id); if (!cpu_node) return NULL; - cache = cache_lookup_by_node(cpu_node); + group_id = get_group_id(cpu_id, 1); + cache = cache_lookup_by_node_group(cpu_node, group_id); of_node_put(cpu_node); return cache; diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c index cdc2dccb987d..64e423d2fe0f 100644 --- a/arch/powerpc/kernel/dawr.c +++ b/arch/powerpc/kernel/dawr.c @@ -9,7 +9,6 @@ #include <linux/export.h> #include <linux/fs.h> #include <linux/debugfs.h> -#include <asm/debugfs.h> #include <asm/machdep.h> #include <asm/hvcall.h> @@ -101,7 +100,7 @@ static int __init dawr_force_setup(void) if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) { /* Turn DAWR off by default, but allow admin to turn it on */ debugfs_create_file_unsafe("dawr_enable_dangerous", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &dawr_force_enable, &dawr_enable_fops); } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 3bbdcc86d01b..e9b597ed423c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -21,9 +21,9 @@ #include <linux/spinlock.h> #include <linux/export.h> #include <linux/of.h> +#include <linux/debugfs.h> #include <linux/atomic.h> -#include <asm/debugfs.h> #include <asm/eeh.h> #include <asm/eeh_event.h> #include <asm/io.h> @@ -1901,24 +1901,24 @@ static int __init eeh_init_proc(void) proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show); #ifdef CONFIG_DEBUG_FS debugfs_create_file_unsafe("eeh_enable", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_enable_dbgfs_ops); debugfs_create_u32("eeh_max_freezes", 0600, - powerpc_debugfs_root, &eeh_max_freezes); + arch_debugfs_dir, &eeh_max_freezes); debugfs_create_bool("eeh_disable_recovery", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &eeh_debugfs_no_recover); debugfs_create_file_unsafe("eeh_dev_check", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_check_fops); debugfs_create_file_unsafe("eeh_dev_break", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_break_fops); debugfs_create_file_unsafe("eeh_force_recover", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_force_recover_fops); debugfs_create_file_unsafe("eeh_dev_can_recover", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_can_recover_fops); eeh_cache_debugfs_init(); #endif diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index bf3270426d82..9bdaaf7fddc9 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c 
@@ -12,8 +12,8 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/atomic.h> +#include <linux/debugfs.h> #include <asm/pci-bridge.h> -#include <asm/debugfs.h> #include <asm/ppc-pci.h> @@ -283,6 +283,6 @@ DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache); void eeh_cache_debugfs_init(void) { debugfs_create_file_unsafe("eeh_address_cache", 0400, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_addr_cache_fops); } diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0273a1349006..61fdd53cdd9a 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -161,10 +161,10 @@ ret_from_fork: ret_from_kernel_thread: REST_NVGPRS(r1) bl schedule_tail - mtlr r14 + mtctr r14 mr r3,r15 PPC440EP_ERR42 - blrl + bctrl li r3,0 b ret_from_syscall diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 15720f8661a1..70cff7b49e17 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -309,7 +309,7 @@ _GLOBAL(enter_rtas) */ lbz r0,PACAIRQSOFTMASK(r13) 1: tdeqi r0,IRQS_ENABLED - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING + EMIT_WARN_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING #endif /* Hard-disable interrupts */ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 1401787b0b93..711c66b76df1 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -545,8 +545,8 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) - std r15,_DSISR(r1) + std r14,_DEAR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x300) @@ -558,8 +558,8 @@ __end_interrupts: PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 - std r14,_DAR(r1) - std r15,_DSISR(r1) + std r14,_DEAR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x400) @@ -575,8 +575,8 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) - std r15,_DSISR(r1) + std r14,_DEAR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x600) @@ -587,7 +587,7 @@ __end_interrupts: NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM, PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR - std r14,_DSISR(r1) + std r14,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) EXCEPTION_COMMON(0x700) addi r3,r1,STACK_FRAME_OVERHEAD @@ -1057,8 +1057,8 @@ bad_stack_book3e: std r11,_CCR(r1) mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR - std r10,_DAR(r1) - std r11,_DSISR(r1) + std r10,_DEAR(r1) + std r11,_ESR(r1) std r0,GPR0(r1); /* save r0 in stackframe */ \ std r2,GPR2(r1); /* save r2 in stackframe */ \ SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ @@ -1127,7 +1127,7 @@ found_iprot: * r3 = MAS0_TLBSEL (for the iprot array) * r4 = SPRN_TLBnCFG */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r5,r7,27,31,31 /* extract MSR[IS] */ @@ -1196,7 +1196,7 @@ skpinv: addi r6,r6,1 /* Increment */ mfmsr r6 xori r6,r6,MSR_IS mtspr SPRN_SRR1,r6 - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r6 addi r6,r6,(2f - 1b) mtspr SPRN_SRR0,r6 @@ -1256,7 +1256,7 @@ skpinv: addi r6,r6,1 /* Increment */ * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping */ /* Now we branch the new virtual address mapped by this entry */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: 
mflr r6 addi r6,r6,(2f - 1b) tovirt(r6,r6) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b990075285f5..b7ceb041743c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -24,8 +24,8 @@ #include <linux/slab.h> #include <linux/cma.h> #include <linux/hugetlb.h> +#include <linux/debugfs.h> -#include <asm/debugfs.h> #include <asm/page.h> #include <asm/prom.h> #include <asm/fadump.h> @@ -1557,7 +1557,7 @@ static void fadump_init_files(void) return; } - debugfs_create_file("fadump_region", 0444, powerpc_debugfs_root, NULL, + debugfs_create_file("fadump_region", 0444, arch_debugfs_dir, NULL, &fadump_region_fops); if (fw_dump.dump_active) { diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 6010adcee16e..ba4afe3b5a9c 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -91,8 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) isync /* enable use of FP after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ - tovirt(r5, r5) + addi r5,r2,THREAD lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S index 8bccce6544b5..dedc17fac8f8 100644 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* 1. Find the index of the entry we're executing in */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r4,r7,27,31,31 /* extract MSR[IS] */ @@ -85,7 +85,7 @@ skpinv: addi r6,r6,1 /* Increment */ addi r6,r6,10 slw r6,r8,r6 /* convert to mask */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r7 mfspr r8,SPRN_MAS3 @@ -117,7 +117,7 @@ skpinv: addi r6,r6,1 /* Increment */ xori r6,r4,1 slwi r6,r6,5 /* setup new context with other address space */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r7,r9,0,20,31 addi r7,r7,(2f - 1b) @@ -207,7 +207,7 @@ next_tlb_setup: lis r7,MSR_KERNEL@h ori r7,r7,MSR_KERNEL@l - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r6,r9,0,20,31 addi r6,r6,(2f - 1b) diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index ddc978a2d381..02d2928d1e01 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -70,7 +70,7 @@ _ENTRY(_start); * address. 
* r21 will be loaded with the physical runtime address of _stext */ - bl 0f /* Get our runtime address */ + bcl 20,31,$+4 /* Get our runtime address */ 0: mflr r21 /* Make it accessible */ addis r21,r21,(_stext - 0b)@ha addi r21,r21,(_stext - 0b)@l /* Get our current runtime base */ @@ -853,7 +853,7 @@ _GLOBAL(init_cpu_state) wmmucr: mtspr SPRN_MMUCR,r3 /* Put MMUCR */ sync - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r5 /* Make it accessible */ tlbsx r23,0,r5 /* Find entry we are in */ li r4,0 /* Start at TLB entry 0 */ @@ -1045,7 +1045,7 @@ head_start_47x: sync /* Find the entry we are running from */ - bl 1f + bcl 20,31,$+4 1: mflr r23 tlbsx r23,0,r23 tlbre r24,r23,0 diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 79930b0bc781..f17ae2083733 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -712,6 +712,8 @@ _GLOBAL(copy_and_flush) isync blr +_ASM_NOKPROBE_SYMBOL(copy_and_flush); /* Called in real mode */ + .align 8 copy_to_here: diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 9a2f4265e6d2..0a9a0f301474 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -79,7 +79,7 @@ _ENTRY(_start); mr r23,r3 mr r25,r4 - bl 0f + bcl 20,31,$+4 0: mflr r8 addis r3,r8,(is_second_reloc - 0b)@ha lwz r19,(is_second_reloc - 0b)@l(r3) @@ -1132,7 +1132,7 @@ _GLOBAL(switch_to_as1) bne 1b /* Get the tlb entry used by the current running code */ - bl 0f + bcl 20,31,$+4 0: mflr r4 tlbsx 0,r4 @@ -1166,7 +1166,7 @@ _GLOBAL(switch_to_as1) _GLOBAL(restore_to_as0) mflr r0 - bl 0f + bcl 20,31,$+4 0: mflr r9 addi r9,r9,1f - 0b diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 21a638aff72f..91a3be14808b 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -22,7 +22,6 @@ #include <asm/processor.h> #include <asm/sstep.h> #include <asm/debug.h> -#include <asm/debugfs.h> #include <asm/hvcall.h> #include <asm/inst.h> #include <linux/uaccess.h> diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 21bbd615ca41..de10a2697258 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -8,7 +8,6 @@ #include <asm/asm-prototypes.h> #include <asm/kup.h> #include <asm/cputime.h> -#include <asm/interrupt.h> #include <asm/hw_irq.h> #include <asm/interrupt.h> #include <asm/kprobes.h> @@ -19,6 +18,7 @@ #include <asm/switch_to.h> #include <asm/syscall.h> #include <asm/time.h> +#include <asm/tm.h> #include <asm/unistd.h> #if defined(CONFIG_PPC_ADV_DEBUG_REGS) && defined(CONFIG_PPC32) @@ -93,8 +93,7 @@ notrace long system_call_exception(long r3, long r4, long r5, CT_WARN_ON(ct_state() == CONTEXT_KERNEL); user_exit_irqoff(); - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) - BUG_ON(!(regs->msr & MSR_RI)); + BUG_ON(regs_is_unrecoverable(regs)); BUG_ON(!(regs->msr & MSR_PR)); BUG_ON(arch_irq_disabled_regs(regs)); @@ -138,6 +137,48 @@ notrace long system_call_exception(long r3, long r4, long r5, */ irq_soft_mask_regs_set_state(regs, IRQS_ENABLED); + /* + * If system call is called with TM active, set _TIF_RESTOREALL to + * prevent RFSCV being used to return to userspace, because POWER9 + * TM implementation has problems with this instruction returning to + * transactional state. Final register values are not relevant because + * the transaction will be aborted upon return anyway. 
Or in the case + * of unsupported_scv SIGILL fault, the return state does not much + * matter because it's an edge case. + */ + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && + unlikely(MSR_TM_TRANSACTIONAL(regs->msr))) + current_thread_info()->flags |= _TIF_RESTOREALL; + + /* + * If the system call was made with a transaction active, doom it and + * return without performing the system call. Unless it was an + * unsupported scv vector, in which case it's treated like an illegal + * instruction. + */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (unlikely(MSR_TM_TRANSACTIONAL(regs->msr)) && + !trap_is_unsupported_scv(regs)) { + /* Enable TM in the kernel, and disable EE (for scv) */ + hard_irq_disable(); + mtmsr(mfmsr() | MSR_TM); + + /* tabort, this dooms the transaction, nothing else */ + asm volatile(".long 0x7c00071d | ((%0) << 16)" + :: "r"(TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT)); + + /* + * Userspace will never see the return value. Execution will + * resume after the tbegin. of the aborted transaction with the + * checkpointed register state. A context switch could occur + * or signal delivered to the process before resuming the + * doomed transaction context, but that should all be handled + * as expected. + */ + return -ENOSYS; + } +#endif // CONFIG_PPC_TRANSACTIONAL_MEM + local_irq_enable(); if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) { @@ -463,9 +504,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs) { unsigned long ret; - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) - BUG_ON(!(regs->msr & MSR_RI)); - BUG_ON(!(regs->msr & MSR_PR)); + BUG_ON(regs_is_unrecoverable(regs)); BUG_ON(arch_irq_disabled_regs(regs)); CT_WARN_ON(ct_state() == CONTEXT_USER); @@ -496,10 +535,8 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) bool stack_store = current_thread_info()->flags & _TIF_EMULATE_STACK_STORE; - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) && - unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) unrecoverable_exception(regs); - BUG_ON(regs->msr & MSR_PR); /* * CT_WARN_ON comes here via program_check_exception, * so avoid recursion. diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index d4212d2ff0b5..ec950b08a8dc 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -12,7 +12,6 @@ #include <asm/mmu.h> #include <asm/ppc_asm.h> #include <asm/ptrace.h> -#include <asm/tm.h> .section ".toc","aw" SYS_CALL_TABLE: @@ -55,12 +54,6 @@ COMPAT_SYS_CALL_TABLE: .globl system_call_vectored_\name system_call_vectored_\name: _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -BEGIN_FTR_SECTION - extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */ - bne tabort_syscall -END_FTR_SECTION_IFSET(CPU_FTR_TM) -#endif SCV_INTERRUPT_TO_KERNEL mr r10,r1 ld r1,PACAKSAVE(r13) @@ -247,12 +240,6 @@ _ASM_NOKPROBE_SYMBOL(system_call_common_real) .globl system_call_common system_call_common: _ASM_NOKPROBE_SYMBOL(system_call_common) -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -BEGIN_FTR_SECTION - extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? 
*/ - bne tabort_syscall -END_FTR_SECTION_IFSET(CPU_FTR_TM) -#endif mr r10,r1 ld r1,PACAKSAVE(r13) std r10,0(r1) @@ -425,34 +412,6 @@ SOFT_MASK_TABLE(.Lsyscall_rst_start, 1b) RESTART_TABLE(.Lsyscall_rst_start, .Lsyscall_rst_end, syscall_restart) #endif -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -tabort_syscall: -_ASM_NOKPROBE_SYMBOL(tabort_syscall) - /* Firstly we need to enable TM in the kernel */ - mfmsr r10 - li r9, 1 - rldimi r10, r9, MSR_TM_LG, 63-MSR_TM_LG - mtmsrd r10, 0 - - /* tabort, this dooms the transaction, nothing else */ - li r9, (TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT) - TABORT(R9) - - /* - * Return directly to userspace. We have corrupted user register state, - * but userspace will never see that register state. Execution will - * resume after the tbegin of the aborted transaction with the - * checkpointed register state. - */ - li r9, MSR_RI - andc r10, r10, r9 - mtmsrd r10, 1 - mtspr SPRN_SRR0, r11 - mtspr SPRN_SRR1, r12 - RFI_TO_USER - b . /* prevent speculative execution */ -#endif - /* * If MSR EE/RI was never enabled, IRQs not reconciled, NVGPRs not * touched, no exit work created, then this can be used. diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 30b7736f0896..07093b7cdcb9 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -688,32 +688,24 @@ static void iommu_table_reserve_pages(struct iommu_table *tbl, if (tbl->it_offset == 0) set_bit(0, tbl->it_map); - tbl->it_reserved_start = res_start; - tbl->it_reserved_end = res_end; - - /* Check if res_start..res_end isn't empty and overlaps the table */ - if (res_start && res_end && - (tbl->it_offset + tbl->it_size < res_start || - res_end < tbl->it_offset)) - return; + if (res_start < tbl->it_offset) + res_start = tbl->it_offset; - for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) - set_bit(i - tbl->it_offset, tbl->it_map); -} + if (res_end > (tbl->it_offset + tbl->it_size)) + res_end = tbl->it_offset + tbl->it_size; -static void iommu_table_release_pages(struct iommu_table *tbl) -{ - int i; + /* Check if res_start..res_end is a valid range in the table */ + if (res_start >= res_end) { + tbl->it_reserved_start = tbl->it_offset; + tbl->it_reserved_end = tbl->it_offset; + return; + } - /* - * In case we have reserved the first bit, we should not emit - * the warning below. 
- */ - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + tbl->it_reserved_start = res_start; + tbl->it_reserved_end = res_end; for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) - clear_bit(i - tbl->it_offset, tbl->it_map); + set_bit(i - tbl->it_offset, tbl->it_map); } /* @@ -777,6 +769,22 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, return tbl; } +bool iommu_table_in_use(struct iommu_table *tbl) +{ + unsigned long start = 0, end; + + /* ignore reserved bit0 */ + if (tbl->it_offset == 0) + start = 1; + end = tbl->it_reserved_start - tbl->it_offset; + if (find_next_bit(tbl->it_map, end, start) != end) + return true; + + start = tbl->it_reserved_end - tbl->it_offset; + end = tbl->it_size; + return find_next_bit(tbl->it_map, end, start) != end; +} + static void iommu_table_free(struct kref *kref) { struct iommu_table *tbl; @@ -793,10 +801,8 @@ static void iommu_table_free(struct kref *kref) iommu_debugfs_del(tbl); - iommu_table_release_pages(tbl); - /* verify that table contains no entries */ - if (!bitmap_empty(tbl->it_map, tbl->it_size)) + if (iommu_table_in_use(tbl)) pr_warn("%s: Unexpected TCEs\n", __func__); /* free bitmap */ @@ -1097,14 +1103,9 @@ int iommu_take_ownership(struct iommu_table *tbl) for (i = 0; i < tbl->nr_pools; i++) spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock); - iommu_table_release_pages(tbl); - - if (!bitmap_empty(tbl->it_map, tbl->it_size)) { + if (iommu_table_in_use(tbl)) { pr_err("iommu_tce: it_map is not empty"); ret = -EBUSY; - /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */ - iommu_table_reserve_pages(tbl, tbl->it_reserved_start, - tbl->it_reserved_end); } else { memset(tbl->it_map, 0xff, sz); } diff --git a/arch/powerpc/kernel/kdebugfs.c b/arch/powerpc/kernel/kdebugfs.c new file mode 100644 index 000000000000..36d3124d5a8b --- /dev/null +++ b/arch/powerpc/kernel/kdebugfs.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/init.h> + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +static int __init arch_kdebugfs_init(void) +{ + arch_debugfs_dir = debugfs_create_dir("powerpc", NULL); + return 0; +} +arch_initcall(arch_kdebugfs_init); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..fd829f7f25a4 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -249,6 +249,7 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; + unsigned long msr; if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) return; @@ -262,8 +263,20 @@ void machine_check_queue_event(void) memcpy(&local_paca->mce_info->mce_event_queue[index], &evt, sizeof(evt)); - /* Queue irq work to process this event later. */ - irq_work_queue(&mce_event_process_work); + /* + * Queue irq work to process this event later. Before + * queuing the work enable translation for non radix LPAR, + * as irq_work_queue may try to access memory outside RMO + * region. 
+ */ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(&mce_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(&mce_event_process_work); + } } void mce_common_process_ue(struct pt_regs *regs, diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 5be96feccb55..fb7de3543c03 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -29,7 +29,7 @@ _GLOBAL(reloc_offset) li r3, 0 _GLOBAL(add_reloc_offset) mflr r0 - bl 1f + bcl 20,31,$+4 1: mflr r5 PPC_LL r4,(2f-1b)(r5) subf r5,r4,r5 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 39ab15419592..e5127b19fec2 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -67,7 +67,7 @@ _GLOBAL(reloc_got2) srwi. r8,r8,2 beqlr mtctr r8 - bl 1f + bcl 20,31,$+4 1: mflr r0 lis r4,1b@ha addi r4,r4,1b@l @@ -237,7 +237,7 @@ _GLOBAL(copy_page) addi r3,r3,-4 0: twnei r5, 0 /* WARN if r3 is not cache aligned */ - EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + EMIT_WARN_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING addi r4,r4,-4 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 4b761a18a74d..d38a019b38e1 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -255,7 +255,7 @@ _GLOBAL(scom970_write) * Physical (hardware) cpu id should be in r3. */ _GLOBAL(kexec_wait) - bl 1f + bcl 20,31,$+4 1: mflr r5 addi r5,r5,kexec_flag-1b diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 001e90cd8948..c3573430919d 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -29,6 +29,7 @@ #include <linux/slab.h> #include <linux/vgaarb.h> #include <linux/numa.h> +#include <linux/msi.h> #include <asm/processor.h> #include <asm/io.h> @@ -1060,11 +1061,16 @@ void pcibios_bus_add_device(struct pci_dev *dev) int pcibios_add_device(struct pci_dev *dev) { + struct irq_domain *d; + #ifdef CONFIG_PCI_IOV if (ppc_md.pcibios_fixup_sriov) ppc_md.pcibios_fixup_sriov(dev); #endif /* CONFIG_PCI_IOV */ + d = dev_get_msi_domain(&dev->bus->dev); + if (d) + dev_set_msi_domain(&dev->dev, d); return 0; } diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 185beb290580..50436b52c213 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f620e04dc9bf..2e67588f6f6e 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -11,7 +11,6 @@ #undef DEBUG -#include <stdarg.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> @@ -640,7 +639,9 @@ static void __init early_reserve_mem(void) } #endif /* CONFIG_BLK_DEV_INITRD */ -#ifdef CONFIG_PPC32 + if (!IS_ENABLED(CONFIG_PPC32)) + return; + /* * Handle the case where we might be booting from an old kexec * image that setup the mem_rsvmap as pairs of 32-bit values @@ -661,7 +662,6 @@ static void __init early_reserve_mem(void) } return; } -#endif } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM diff --git 
a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index a5bf355ce1d6..18b04b08b983 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -14,7 +14,7 @@ /* we cannot use FORTIFY as it brings in new symbols */ #define __NO_FORTIFY -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> @@ -1096,7 +1096,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { #else 0, #endif - .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN), + .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) | + OV5_FEAT(OV5_FORM2_AFFINITY), .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), .micro_checkpoint = 0, .reserved0 = 0, diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 0a0a33eb0d28..7c7093c17c45 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -373,8 +373,12 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, trap)); BUILD_BUG_ON(offsetof(struct pt_regs, dar) != offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dear) != + offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != + offsetof(struct user_pt_regs, dsisr)); BUILD_BUG_ON(offsetof(struct pt_regs, result) != offsetof(struct user_pt_regs, result)); diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S index 10e96f3e22fe..0508c14b4c28 100644 --- a/arch/powerpc/kernel/reloc_32.S +++ b/arch/powerpc/kernel/reloc_32.S @@ -30,7 +30,7 @@ R_PPC_RELATIVE = 22 _GLOBAL(relocate) mflr r0 /* Save our LR */ - bl 0f /* Find our current runtime address */ + bcl 20,31,$+4 /* Find our current runtime address */ 0: mflr r12 /* Make it accessible */ mtlr r0 diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 99f2cce635fb..ff80bbad22a5 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -7,7 +7,7 @@ * Copyright (C) 2001 IBM. */ -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/spinlock.h> diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 8561dfb33f24..32ee17753eb4 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -429,7 +429,7 @@ static void rtas_event_scan(struct work_struct *w) do_event_scan(); - get_online_cpus(); + cpus_read_lock(); /* raw_ OK because just using CPU as starting point. 
*/ cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); @@ -451,7 +451,7 @@ static void rtas_event_scan(struct work_struct *w) schedule_delayed_work_on(cpu, &event_scan_work, __round_jiffies_relative(event_scan_delay, cpu)); - put_online_cpus(); + cpus_read_unlock(); } #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index cc51fa52e783..1a998490fe60 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -11,10 +11,10 @@ #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/seq_buf.h> +#include <linux/debugfs.h> #include <asm/asm-prototypes.h> #include <asm/code-patching.h> -#include <asm/debugfs.h> #include <asm/security_features.h> #include <asm/setup.h> #include <asm/inst.h> @@ -106,7 +106,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_barrier_nospec, barrier_nospec_get, static __init int barrier_nospec_debugfs_init(void) { debugfs_create_file_unsafe("barrier_nospec", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_barrier_nospec); return 0; } @@ -114,7 +114,7 @@ device_initcall(barrier_nospec_debugfs_init); static __init int security_feature_debugfs_init(void) { - debugfs_create_x64("security_features", 0400, powerpc_debugfs_root, + debugfs_create_x64("security_features", 0400, arch_debugfs_dir, &powerpc_security_features); return 0; } @@ -420,7 +420,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, static __init int stf_barrier_debugfs_init(void) { - debugfs_create_file_unsafe("stf_barrier", 0600, powerpc_debugfs_root, + debugfs_create_file_unsafe("stf_barrier", 0600, arch_debugfs_dir, NULL, &fops_stf_barrier); return 0; } @@ -748,7 +748,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get, static __init int count_cache_flush_debugfs_init(void) { debugfs_create_file_unsafe("count_cache_flush", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_count_cache_flush); return 0; } @@ -834,9 +834,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_uaccess_flush, uaccess_flush_get, uaccess_flush_set static __init int rfi_flush_debugfs_init(void) { - debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); - debugfs_create_file("entry_flush", 0600, powerpc_debugfs_root, NULL, &fops_entry_flush); - debugfs_create_file("uaccess_flush", 0600, powerpc_debugfs_root, NULL, &fops_uaccess_flush); + debugfs_create_file("rfi_flush", 0600, arch_debugfs_dir, NULL, &fops_rfi_flush); + debugfs_create_file("entry_flush", 0600, arch_debugfs_dir, NULL, &fops_entry_flush); + debugfs_create_file("uaccess_flush", 0600, arch_debugfs_dir, NULL, &fops_uaccess_flush); return 0; } device_initcall(rfi_flush_debugfs_init); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index aa9c2d01424a..b1e43b69a559 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -33,7 +33,6 @@ #include <linux/of_platform.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> -#include <asm/debugfs.h> #include <asm/io.h> #include <asm/paca.h> #include <asm/prom.h> @@ -773,18 +772,6 @@ static int __init check_cache_coherency(void) late_initcall(check_cache_coherency); #endif /* CONFIG_CHECK_CACHE_COHERENCY */ -#ifdef CONFIG_DEBUG_FS -struct dentry *powerpc_debugfs_root; -EXPORT_SYMBOL(powerpc_debugfs_root); - -static int powerpc_debugfs_init(void) -{ - powerpc_debugfs_root = debugfs_create_dir("powerpc", NULL); - return 0; -} -arch_initcall(powerpc_debugfs_init); -#endif - void 
ppc_printk_progress(char *s, unsigned short hex) { pr_info("%s\n", s); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1ff258f6c76c..eaa79a0996d1 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -32,7 +32,6 @@ #include <linux/nmi.h> #include <linux/pgtable.h> -#include <asm/debugfs.h> #include <asm/kvm_guest.h> #include <asm/io.h> #include <asm/kdump.h> diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e600764a926c..b93b87df499d 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -293,10 +293,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) do_signal(current); } - if (thread_info_flags & _TIF_NOTIFY_RESUME) { + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } } static unsigned long get_tm_stackpointer(struct task_struct *tsk) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 447b78a87c8f..9cc7d3dbf439 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -78,6 +78,7 @@ struct task_struct *secondary_current; bool has_big_cores; bool coregroup_enabled; bool thread_group_shares_l2; +bool thread_group_shares_l3; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); @@ -101,7 +102,7 @@ enum { #define MAX_THREAD_LIST_SIZE 8 #define THREAD_GROUP_SHARE_L1 1 -#define THREAD_GROUP_SHARE_L2 2 +#define THREAD_GROUP_SHARE_L2_L3 2 struct thread_groups { unsigned int property; unsigned int nr_groups; @@ -122,14 +123,20 @@ static struct thread_groups_list tgl[NR_CPUS] __initdata; * On big-cores system, thread_group_l1_cache_map for each CPU corresponds to * the set its siblings that share the L1-cache. */ -static DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); +DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); /* * On some big-cores system, thread_group_l2_cache_map for each CPU * corresponds to the set its siblings within the core that share the * L2-cache. 
*/ -static DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); +DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); + +/* + * On P10, thread_group_l3_cache_map for each CPU is equal to the + * thread_group_l2_cache_map + */ +DEFINE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map); /* SMP operations for this machine */ struct smp_ops_t *smp_ops; @@ -889,19 +896,41 @@ out: return tg; } +static int update_mask_from_threadgroup(cpumask_var_t *mask, struct thread_groups *tg, int cpu, int cpu_group_start) +{ + int first_thread = cpu_first_thread_sibling(cpu); + int i; + + zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu)); + + for (i = first_thread; i < first_thread + threads_per_core; i++) { + int i_group_start = get_cpu_thread_group_start(i, tg); + + if (unlikely(i_group_start == -1)) { + WARN_ON_ONCE(1); + return -ENODATA; + } + + if (i_group_start == cpu_group_start) + cpumask_set_cpu(i, *mask); + } + + return 0; +} + static int __init init_thread_group_cache_map(int cpu, int cache_property) { - int first_thread = cpu_first_thread_sibling(cpu); - int i, cpu_group_start = -1, err = 0; + int cpu_group_start = -1, err = 0; struct thread_groups *tg = NULL; cpumask_var_t *mask = NULL; if (cache_property != THREAD_GROUP_SHARE_L1 && - cache_property != THREAD_GROUP_SHARE_L2) + cache_property != THREAD_GROUP_SHARE_L2_L3) return -EINVAL; tg = get_thread_groups(cpu, cache_property, &err); + if (!tg) return err; @@ -912,25 +941,18 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property) return -ENODATA; } - if (cache_property == THREAD_GROUP_SHARE_L1) + if (cache_property == THREAD_GROUP_SHARE_L1) { mask = &per_cpu(thread_group_l1_cache_map, cpu); - else if (cache_property == THREAD_GROUP_SHARE_L2) + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + } + else if (cache_property == THREAD_GROUP_SHARE_L2_L3) { mask = &per_cpu(thread_group_l2_cache_map, cpu); - - zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu)); - - for (i = first_thread; i < first_thread + threads_per_core; i++) { - int i_group_start = get_cpu_thread_group_start(i, tg); - - if (unlikely(i_group_start == -1)) { - WARN_ON_ONCE(1); - return -ENODATA; - } - - if (i_group_start == cpu_group_start) - cpumask_set_cpu(i, *mask); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + mask = &per_cpu(thread_group_l3_cache_map, cpu); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); } + return 0; } @@ -1020,14 +1042,16 @@ static int __init init_big_cores(void) has_big_cores = true; for_each_possible_cpu(cpu) { - int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2); + int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2_L3); if (err) return err; } thread_group_shares_l2 = true; - pr_debug("L2 cache only shared by the threads in the small core\n"); + thread_group_shares_l3 = true; + pr_debug("L2/L3 cache only shared by the threads in the small core\n"); + return 0; } @@ -1085,7 +1109,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } if (cpu_to_chip_id(boot_cpuid) != -1) { - int idx = num_possible_cpus() / threads_per_core; + int idx = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); /* * All threads of a core will all belong to the same core, @@ -1376,7 +1400,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) l2_cache = cpu_to_l2cache(cpu); if (!l2_cache || !*mask) { /* Assume only core siblings share cache with this CPU */ - for_each_cpu(i, submask_fn(cpu)) + for_each_cpu(i, cpu_sibling_mask(cpu)) 
set_cpus_related(cpu, i, cpu_l2_cache_mask); return false; @@ -1418,6 +1442,8 @@ static void remove_cpu_from_masks(int cpu) struct cpumask *(*mask_fn)(int) = cpu_sibling_mask; int i; + unmap_cpu_from_node(cpu); + if (shared_caches) mask_fn = cpu_l2_cache_mask; @@ -1502,7 +1528,9 @@ static void add_cpu_to_masks(int cpu) * This CPU will not be in the online mask yet so we need to manually * add it to it's own thread sibling mask. */ + map_cpu_to_node(cpu, cpu_to_node(cpu)); cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); for (i = first_thread; i < first_thread + threads_per_core; i++) if (cpu_online(i)) @@ -1520,11 +1548,6 @@ static void add_cpu_to_masks(int cpu) if (chip_id_lookup_table && ret) chip_id = cpu_to_chip_id(cpu); - if (chip_id == -1) { - cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu)); - goto out; - } - if (shared_caches) submask_fn = cpu_l2_cache_mask; @@ -1534,6 +1557,10 @@ static void add_cpu_to_masks(int cpu) /* Skip all CPUs already part of current CPU core mask */ cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu)); + /* If chip_id is -1; limit the cpu_core_mask to within DIE*/ + if (chip_id == -1) + cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + for_each_cpu(i, mask) { if (chip_id == cpu_to_chip_id(i)) { or_cpumasks_related(cpu, i, submask_fn, cpu_core_mask); @@ -1543,7 +1570,6 @@ static void add_cpu_to_masks(int cpu) } } -out: free_cpumask_var(mask); } diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 2b0d04a1b7d2..9e4a4a7af380 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -8,6 +8,7 @@ * Copyright 2018 Nick Piggin, Michael Ellerman, IBM Corp. */ +#include <linux/delay.h> #include <linux/export.h> #include <linux/kallsyms.h> #include <linux/module.h> diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index bf4ae0f0e36c..825931e400df 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -41,20 +41,13 @@ static inline long do_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) { - long ret = -EINVAL; - if (!arch_validate_prot(prot, addr)) - goto out; + return -EINVAL; - if (shift) { - if (off & ((1 << shift) - 1)) - goto out; - off >>= shift; - } + if (!IS_ALIGNED(off, 1 << shift)) + return -EINVAL; - ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off); -out: - return ret; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> shift); } SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len, diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 6f3953f2a0d5..7bef917cc84e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -330,10 +330,10 @@ 256 64 sys_debug_setcontext sys_ni_syscall 256 spu sys_debug_setcontext sys_ni_syscall # 257 reserved for vserver -258 nospu migrate_pages sys_migrate_pages compat_sys_migrate_pages -259 nospu mbind sys_mbind compat_sys_mbind -260 nospu get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -261 nospu set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +258 nospu migrate_pages sys_migrate_pages +259 nospu mbind sys_mbind +260 nospu get_mempolicy sys_get_mempolicy +261 nospu set_mempolicy sys_set_mempolicy 262 nospu mq_open sys_mq_open compat_sys_mq_open 263 nospu mq_unlink sys_mq_unlink 264 32 mq_timedsend sys_mq_timedsend_time32 @@ -381,7 +381,7 @@ 298 common 
faccessat sys_faccessat 299 common get_robust_list sys_get_robust_list compat_sys_get_robust_list 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list -301 common move_pages sys_move_pages compat_sys_move_pages +301 common move_pages sys_move_pages 302 common getcpu sys_getcpu 303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 304 32 utimensat sys_utimensat_time32 @@ -526,3 +526,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index b9a047d92ec0..8e83d19fe8fa 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -164,7 +164,7 @@ static void tau_work_func(struct work_struct *work) queue_work(tau_workq, work); } -DECLARE_WORK(tau_work, tau_work_func); +static DECLARE_WORK(tau_work, tau_work_func); /* * setup the TAU diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index c487ba5a6e11..934d8ae66cc6 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -31,6 +31,7 @@ #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/sched/cputime.h> #include <linux/kernel.h> #include <linux/param.h> #include <linux/string.h> @@ -52,8 +53,6 @@ #include <linux/irq_work.h> #include <linux/of_clk.h> #include <linux/suspend.h> -#include <linux/sched/cputime.h> -#include <linux/sched/clock.h> #include <linux/processor.h> #include <asm/trace.h> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index cd5f633c9b1b..aac8c0412ff9 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,10 +37,10 @@ #include <linux/smp.h> #include <linux/console.h> #include <linux/kmsg_dump.h> +#include <linux/debugfs.h> #include <asm/emulated_ops.h> #include <linux/uaccess.h> -#include <asm/debugfs.h> #include <asm/interrupt.h> #include <asm/io.h> #include <asm/machdep.h> @@ -427,7 +427,7 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs) return; nonrecoverable: - regs_set_return_msr(regs, regs->msr & ~MSR_RI); + regs_set_unrecoverable(regs); #endif } DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception) @@ -497,7 +497,7 @@ out: die("Unrecoverable nested System Reset", regs, SIGABRT); #endif /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* For the reason explained in die_mce, nmi_exit before die */ nmi_exit(); die("Unrecoverable System Reset", regs, SIGABRT); @@ -549,7 +549,7 @@ static inline int check_io_access(struct pt_regs *regs) printk(KERN_DEBUG "%s bad port %lx at %p\n", (*nip & 0x100)? "OUT to": "IN from", regs->gpr[rb] - _IO_BASE, nip); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } @@ -561,7 +561,7 @@ static inline int check_io_access(struct pt_regs *regs) #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* On 4xx, the reason for the machine check or program exception is in the ESR. 
*/ -#define get_reason(regs) ((regs)->dsisr) +#define get_reason(regs) ((regs)->esr) #define REASON_FP ESR_FP #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR @@ -839,7 +839,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) bail: /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) + if (regs_is_unrecoverable(regs)) die_mce("Unrecoverable Machine check", regs, SIGBUS); #ifdef CONFIG_PPC_BOOK3S_64 @@ -1481,8 +1481,13 @@ static void do_program_check(struct pt_regs *regs) if (!(regs->msr & MSR_PR) && /* not user-mode */ report_bug(bugaddr, regs) == BUG_TRAP_TYPE_WARN) { - regs_add_return_ip(regs, 4); - return; + const struct exception_table_entry *entry; + + entry = search_exception_tables(bugaddr); + if (entry) { + regs_set_return_ip(regs, extable_fixup(entry) + regs->nip - bugaddr); + return; + } } _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); return; @@ -2214,11 +2219,6 @@ DEFINE_INTERRUPT_HANDLER(kernel_bad_stack) die("Bad kernel stack pointer", regs, SIGABRT); } -void __init trap_init(void) -{ -} - - #ifdef CONFIG_PPC_EMULATED_STATS #define WARN_EMULATED_SETUP(type) .type = { .name = #type } @@ -2271,7 +2271,7 @@ static int __init ppc_warn_emulated_init(void) struct ppc_emulated_entry *entries = (void *)&ppc_emulated; dir = debugfs_create_dir("emulated_instructions", - powerpc_debugfs_root); + arch_debugfs_dir); debugfs_create_u32("do_warn", 0644, dir, &ppc_warn_emulated); diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 01595e8cafe7..b1544b2f6321 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -5,7 +5,7 @@ * c 2001 PPC 64 Team, IBM Corp */ -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/console.h> diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index fc120fac1910..ba03eedfdcd8 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -65,9 +65,8 @@ _GLOBAL(load_up_altivec) 1: /* enable use of VMX after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ + addi r5,r2,THREAD oris r9,r9,MSR_VEC@h - tovirt(r5, r5) #else ld r4,PACACURRENT(r13) addi r5,r4,THREAD /* Get THREAD */ @@ -81,7 +80,6 @@ _GLOBAL(load_up_altivec) li r4,1 stb r4,THREAD_LOAD_VEC(r5) addi r6,r5,THREAD_VRSTATE - li r4,1 li r10,VRSTATE_VSCR stw r4,THREAD_USED_VR(r5) lvx v0,r10,r6 diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 8a449b2d8715..89c069d664a5 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -64,15 +64,18 @@ int default_machine_kexec_prepare(struct kimage *image) begin = image->segment[i].mem; end = begin + image->segment[i].memsz; - if ((begin < high) && (end > low)) + if ((begin < high) && (end > low)) { + of_node_put(node); return -ETXTBSY; + } } } return 0; } -static void copy_segments(unsigned long ind) +/* Called during kexec sequence with MMU off */ +static notrace void copy_segments(unsigned long ind) { unsigned long entry; unsigned long *ptr; @@ -105,7 +108,8 @@ static void copy_segments(unsigned long ind) } } -void kexec_copy_flush(struct kimage *image) +/* Called during kexec sequence with MMU off */ +notrace void kexec_copy_flush(struct kimage *image) { long i, nr_segments = image->nr_segments; struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; diff --git a/arch/powerpc/kexec/relocate_32.S b/arch/powerpc/kexec/relocate_32.S index 61946c19e07c..cf6e52bdf8d8 100644 --- 
a/arch/powerpc/kexec/relocate_32.S +++ b/arch/powerpc/kexec/relocate_32.S @@ -93,7 +93,7 @@ wmmucr: * Invalidate all the TLB entries except the current entry * where we are running from */ - bl 0f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 0: mflr r5 /* Make it accessible */ tlbsx r23,0,r5 /* Find entry we are in */ li r4,0 /* Start at TLB entry 0 */ @@ -158,7 +158,7 @@ write_out: /* Switch to other address space in MSR */ insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ - bl 1f + bcl 20,31,$+4 1: mflr r8 addi r8, r8, (2f-1b) /* Find the target offset */ @@ -202,7 +202,7 @@ next_tlb: li r9,0 insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ - bl 1f + bcl 20,31,$+4 1: mflr r8 and r8, r8, r11 /* Get our offset within page */ addi r8, r8, (2f-1b) @@ -240,7 +240,7 @@ setup_map_47x: sync /* Find the entry we are running from */ - bl 2f + bcl 20,31,$+4 2: mflr r23 tlbsx r23, 0, r23 tlbre r24, r23, 0 /* TLB Word 0 */ @@ -296,7 +296,7 @@ clear_utlb_entry: /* Update the msr to the new TS */ insrwi r5, r7, 1, 26 - bl 1f + bcl 20,31,$+4 1: mflr r6 addi r6, r6, (2f-1b) @@ -355,7 +355,7 @@ write_utlb: /* Defaults to 256M */ lis r10, 0x1000 - bl 1f + bcl 20,31,$+4 1: mflr r4 addi r4, r4, (2f-1b) /* virtual address of 2f */ diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index e45644657d49..ff581d70f20c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -38,7 +38,6 @@ config KVM_BOOK3S_32_HANDLER config KVM_BOOK3S_64_HANDLER bool select KVM_BOOK3S_HANDLER - select PPC_DAWR_FORCE_ENABLE config KVM_BOOK3S_PR_POSSIBLE bool diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 79833f78d1da..b785f6772391 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -43,8 +43,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -71,7 +69,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, emulated_inst_exits), STATS_DESC_COUNTER(VCPU, dec_exits), STATS_DESC_COUNTER(VCPU, ext_intr_exits), - STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), STATS_DESC_COUNTER(VCPU, halt_successful_wait), STATS_DESC_COUNTER(VCPU, dbell_exits), STATS_DESC_COUNTER(VCPU, gdbell_exits), @@ -88,8 +85,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 740e51def5a5..58391b4b32ed 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -23,7 +23,8 @@ extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); extern int kvmppc_book3s_init_pr(void); -extern void kvmppc_book3s_exit_pr(void); +void kvmppc_book3s_exit_pr(void); +extern int kvmppc_handle_exit_pr(struct kvm_vcpu *vcpu, unsigned int exit_nr); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val); diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 26b8b27a3755..feee40cb2ba1 
100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -196,7 +196,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, hva_t ptegp; u64 pteg[16]; u64 avpn = 0; - u64 v, r; + u64 r; u64 v_val, v_mask; u64 eaddr_mask; int i; @@ -285,7 +285,6 @@ do_second: goto do_second; } - v = be64_to_cpu(pteg[i]); r = be64_to_cpu(pteg[i+1]); pp = (r & HPTE_R_PP) | key; if (r & HPTE_R_PP0) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index b5905ae4377c..16359525a40f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -44,6 +44,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, (to != NULL) ? __pa(to): 0, (from != NULL) ? __pa(from): 0, n); + if (eaddr & (0xFFFUL << 52)) + return ret; + quadrant = 1; if (!pid) quadrant = 2; @@ -65,10 +68,12 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, } isync(); + pagefault_disable(); if (is_load) - ret = copy_from_user_nofault(to, (const void __user *)from, n); + ret = __copy_from_user_inatomic(to, (const void __user *)from, n); else - ret = copy_to_user_nofault((void __user *)to, from, n); + ret = __copy_to_user_inatomic((void __user *)to, from, n); + pagefault_enable(); /* switch the pid first to avoid running host with unallocated pid */ if (quadrant == 1 && pid != old_pid) @@ -81,7 +86,6 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, return ret; } -EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix); static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, void *from, unsigned long n) @@ -117,14 +121,12 @@ long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, return ret; } -EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix); long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from, unsigned long n) { return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n); } -EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix); int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, u64 root, diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8da93fdfa59e..6365087f3160 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -346,7 +346,7 @@ static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); if (!memslot) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index dc6591548f0c..870b7f0c7ea5 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -80,7 +80,7 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots_raw(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (!memslot) return -EINVAL; @@ -173,10 +173,13 @@ static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt, idx -= stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; /* - * page must not be NULL in real mode, - * kvmppc_rm_ioba_validate() must have taken care of this. + * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is + * being cleared, otherwise it returns H_TOO_HARD and we skip this. 
*/ - WARN_ON_ONCE_RM(!page); + if (!page) { + WARN_ON_ONCE_RM(tce != 0); + return; + } tbl = kvmppc_page_address(page); tbl[idx % TCES_PER_PAGE] = tce; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 085fb8ecbf68..2acb1c96cfaf 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -59,6 +59,7 @@ #include <asm/kvm_book3s.h> #include <asm/mmu_context.h> #include <asm/lppaca.h> +#include <asm/pmc.h> #include <asm/processor.h> #include <asm/cputhreads.h> #include <asm/page.h> @@ -1165,7 +1166,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; #endif case H_RANDOM: - if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) + if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4])) ret = H_HARDWARE; break; case H_RPT_INVALIDATE: @@ -1679,6 +1680,21 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, r = RESUME_GUEST; } break; + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case BOOK3S_INTERRUPT_HV_SOFTPATCH: + /* + * This occurs for various TM-related instructions that + * we need to emulate on POWER9 DD2.2. We have already + * handled the cases where the guest was in real-suspend + * mode and was transitioning to transactional state. + */ + r = kvmhv_p9_tm_emulation(vcpu); + if (r != -1) + break; + fallthrough; /* go to facility unavailable handler */ +#endif + /* * This occurs if the guest (kernel or userspace), does something that * is prohibited by HFSCR. @@ -1697,18 +1713,6 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, } break; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - case BOOK3S_INTERRUPT_HV_SOFTPATCH: - /* - * This occurs for various TM-related instructions that - * we need to emulate on POWER9 DD2.2. We have already - * handled the cases where the guest was in real-suspend - * mode and was transitioning to transactional state. - */ - r = kvmhv_p9_tm_emulation(vcpu); - break; -#endif - case BOOK3S_INTERRUPT_HV_RM_HARD: r = RESUME_PASSTHROUGH; break; @@ -1727,6 +1731,7 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; int r; int srcu_idx; @@ -1811,9 +1816,41 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) * mode and was transitioning to transactional state. */ r = kvmhv_p9_tm_emulation(vcpu); - break; + if (r != -1) + break; + fallthrough; /* go to facility unavailable handler */ #endif + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: { + u64 cause = vcpu->arch.hfscr >> 56; + + /* + * Only pass HFU interrupts to the L1 if the facility is + * permitted but disabled by the L1's HFSCR, otherwise + * the interrupt does not make sense to the L1 so turn + * it into a HEAI. + */ + if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) || + (nested->hfscr & (1UL << cause))) { + vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST; + + /* + * If the fetch failed, return to guest and + * try executing it again. 
+ */ + r = kvmppc_get_last_inst(vcpu, INST_GENERIC, + &vcpu->arch.emul_inst); + if (r != EMULATE_DONE) + r = RESUME_GUEST; + else + r = RESUME_HOST; + } else { + r = RESUME_HOST; + } + + break; + } + case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; r = RESUME_GUEST; @@ -2684,6 +2721,7 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) spin_lock_init(&vcpu->arch.vpa_update_lock); spin_lock_init(&vcpu->arch.tbacct_lock); vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.shregs.msr = MSR_ME; vcpu->arch.intr_msr = MSR_SF | MSR_ME; /* @@ -2705,6 +2743,8 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) if (cpu_has_feature(CPU_FTR_TM_COMP)) vcpu->arch.hfscr |= HFSCR_TM; + vcpu->arch.hfscr_permitted = vcpu->arch.hfscr; + kvmppc_mmu_book3s_hv_init(vcpu); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; @@ -3727,7 +3767,6 @@ static void load_spr_state(struct kvm_vcpu *vcpu) mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); mtspr(SPRN_BESCR, vcpu->arch.bescr); - mtspr(SPRN_WORT, vcpu->arch.wort); mtspr(SPRN_TIDR, vcpu->arch.tid); mtspr(SPRN_AMR, vcpu->arch.amr); mtspr(SPRN_UAMOR, vcpu->arch.uamor); @@ -3754,7 +3793,6 @@ static void store_spr_state(struct kvm_vcpu *vcpu) vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); vcpu->arch.bescr = mfspr(SPRN_BESCR); - vcpu->arch.wort = mfspr(SPRN_WORT); vcpu->arch.tid = mfspr(SPRN_TIDR); vcpu->arch.amr = mfspr(SPRN_AMR); vcpu->arch.uamor = mfspr(SPRN_UAMOR); @@ -3786,7 +3824,6 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, struct p9_host_os_sprs *host_os_sprs) { mtspr(SPRN_PSPB, 0); - mtspr(SPRN_WORT, 0); mtspr(SPRN_UAMOR, 0); mtspr(SPRN_DSCR, host_os_sprs->dscr); @@ -3852,6 +3889,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use; + } else { + get_lppaca()->pmcregs_in_use = 1; + } + barrier(); + } +#endif kvmhv_load_guest_pmu(vcpu); msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); @@ -3986,6 +4035,13 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, save_pmu |= nesting_enabled(vcpu->kvm); kvmhv_save_guest_pmu(vcpu, save_pmu); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse(); + barrier(); + } +#endif vc->entry_exit_map = 0x101; vc->in_guest = 0; @@ -4146,19 +4202,31 @@ out: /* Attribute wait time */ if (do_sleep) { - vc->runner->stat.halt_wait_ns += + vc->runner->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(start_wait); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(start_wait)); /* Attribute failed poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_fail_ns += ktime_to_ns(start_wait) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_fail_hist, + ktime_to_ns(start_wait) - + ktime_to_ns(start_poll)); + } } else { /* Attribute successful poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_success_ns += ktime_to_ns(cur) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_success_hist, + ktime_to_ns(cur) - 
ktime_to_ns(start_poll)); + } } /* Adjust poll time */ @@ -5328,6 +5396,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) struct kvmppc_passthru_irqmap *pimap; struct irq_chip *chip; int i, rc = 0; + struct irq_data *host_data; if (!kvm_irq_bypass) return 1; @@ -5355,7 +5424,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) * what our real-mode EOI code does, or a XIVE interrupt */ chip = irq_data_get_irq_chip(&desc->irq_data); - if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) { + if (!chip || !is_pnv_opal_msi(chip)) { pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", host_irq, guest_gsi); mutex_unlock(&kvm->lock); @@ -5392,15 +5461,22 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) * the KVM real mode handler. */ smp_wmb(); - irq_map->r_hwirq = desc->irq_data.hwirq; + + /* + * The 'host_irq' number is mapped in the PCI-MSI domain but + * the underlying calls, which will EOI the interrupt in real + * mode, need an HW IRQ number mapped in the XICS IRQ domain. + */ + host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq); + irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data); if (i == pimap->n_mapped) pimap->n_mapped++; if (xics_on_xive()) - rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); + rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq); else - kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq); if (rc) irq_map->r_hwirq = 0; @@ -5439,7 +5515,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) } if (xics_on_xive()) - rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); + rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq); else kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index be8ef1c5b1bf..fcf4760a3a0e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -137,23 +137,23 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, * exist in the system. We use a counter of VMs to track this. * * One of the operations we need to block is onlining of secondaries, so we - * protect hv_vm_count with get/put_online_cpus(). + * protect hv_vm_count with cpus_read_lock/unlock(). 
*/ static atomic_t hv_vm_count; void kvm_hv_vm_activated(void) { - get_online_cpus(); + cpus_read_lock(); atomic_inc(&hv_vm_count); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(kvm_hv_vm_activated); void kvm_hv_vm_deactivated(void) { - get_online_cpus(); + cpus_read_lock(); atomic_dec(&hv_vm_count); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated); diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 898f942eb198..ed8a2c9f5629 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -99,13 +99,12 @@ static void byteswap_hv_regs(struct hv_guest_state *hr) hr->dawrx1 = swab64(hr->dawrx1); } -static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, +static void save_hv_return_state(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; hr->dpdes = vc->dpdes; - hr->hfscr = vcpu->arch.hfscr; hr->purr = vcpu->arch.purr; hr->spurr = vcpu->arch.spurr; hr->ic = vcpu->arch.ic; @@ -119,7 +118,7 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, hr->pidr = vcpu->arch.pid; hr->cfar = vcpu->arch.cfar; hr->ppr = vcpu->arch.ppr; - switch (trap) { + switch (vcpu->arch.trap) { case BOOK3S_INTERRUPT_H_DATA_STORAGE: hr->hdar = vcpu->arch.fault_dar; hr->hdsisr = vcpu->arch.fault_dsisr; @@ -128,55 +127,17 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, case BOOK3S_INTERRUPT_H_INST_STORAGE: hr->asdr = vcpu->arch.fault_gpa; break; + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) | + (HFSCR_INTR_CAUSE & vcpu->arch.hfscr)); + break; case BOOK3S_INTERRUPT_H_EMUL_ASSIST: hr->heir = vcpu->arch.emul_inst; break; } } -/* - * This can result in some L0 HV register state being leaked to an L1 - * hypervisor when the hv_guest_state is copied back to the guest after - * being modified here. - * - * There is no known problem with such a leak, and in many cases these - * register settings could be derived by the guest by observing behaviour - * and timing, interrupts, etc., but it is an issue to consider. - */ -static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) -{ - struct kvmppc_vcore *vc = vcpu->arch.vcore; - u64 mask; - - /* - * Don't let L1 change LPCR bits for the L2 except these: - */ - mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | - LPCR_LPES | LPCR_MER; - - /* - * Additional filtering is required depending on hardware - * and configuration. - */ - hr->lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, - (vc->lpcr & ~mask) | (hr->lpcr & mask)); - - /* - * Don't let L1 enable features for L2 which we've disabled for L1, - * but preserve the interrupt cause field. 
- */ - hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); - - /* Don't let data address watchpoint match in hypervisor state */ - hr->dawrx0 &= ~DAWRX_HYP; - hr->dawrx1 &= ~DAWRX_HYP; - - /* Don't let completed instruction address breakpt match in HV state */ - if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) - hr->ciabr &= ~CIABR_PRIV; -} - -static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -288,6 +249,43 @@ static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu, sizeof(struct pt_regs)); } +static void load_l2_hv_regs(struct kvm_vcpu *vcpu, + const struct hv_guest_state *l2_hv, + const struct hv_guest_state *l1_hv, u64 *lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + restore_hv_regs(vcpu, l2_hv); + + /* + * Don't let L1 change LPCR bits for the L2 except these: + */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + + /* + * Additional filtering is required depending on hardware + * and configuration. + */ + *lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, + (vc->lpcr & ~mask) | (*lpcr & mask)); + + /* + * Don't let L1 enable features for L2 which we don't allow for L1, + * but preserve the interrupt cause field. + */ + vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr_permitted); + + /* Don't let data address watchpoint match in hypervisor state */ + vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP; + vcpu->arch.dawrx1 = l2_hv->dawrx1 & ~DAWRX_HYP; + + /* Don't let completed instruction address breakpt match in HV state */ + if ((l2_hv->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) + vcpu->arch.ciabr = l2_hv->ciabr & ~CIABR_PRIV; +} + long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) { long int err, r; @@ -296,7 +294,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) struct hv_guest_state l2_hv = {0}, saved_l1_hv; struct kvmppc_vcore *vc = vcpu->arch.vcore; u64 hv_ptr, regs_ptr; - u64 hdec_exp; + u64 hdec_exp, lpcr; s64 delta_purr, delta_spurr, delta_ic, delta_vtb; if (vcpu->kvm->arch.l1_ptcr == 0) @@ -364,13 +362,14 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) /* set L1 state to L2 state */ vcpu->arch.nested = l2; vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; + l2->hfscr = l2_hv.hfscr; vcpu->arch.regs = l2_regs; /* Guest must always run with ME enabled, HV disabled. 
*/ vcpu->arch.shregs.msr = (vcpu->arch.regs.msr | MSR_ME) & ~MSR_HV; - sanitise_hv_regs(vcpu, &l2_hv); - restore_hv_regs(vcpu, &l2_hv); + lpcr = l2_hv.lpcr; + load_l2_hv_regs(vcpu, &l2_hv, &saved_l1_hv, &lpcr); vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; @@ -380,7 +379,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) r = RESUME_HOST; break; } - r = kvmhv_run_single_vcpu(vcpu, hdec_exp, l2_hv.lpcr); + r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr); } while (is_kvmppc_resume_guest(r)); /* save L2 state for return */ @@ -390,7 +389,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) delta_spurr = vcpu->arch.spurr - l2_hv.spurr; delta_ic = vcpu->arch.ic - l2_hv.ic; delta_vtb = vc->vtb - l2_hv.vtb; - save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); + save_hv_return_state(vcpu, &l2_hv); /* restore L1 state */ vcpu->arch.nested = NULL; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 0a11ec88a0ae..587c33fc4564 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -706,6 +706,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) icp->rm_eoied_irq = irq; } + /* Handle passthrough interrupts */ if (state->host_irq) { ++vcpu->stat.pthru_all; if (state->intr_cpu != -1) { @@ -759,12 +760,12 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) static unsigned long eoi_rc; -static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) +static void icp_eoi(struct irq_data *d, u32 hwirq, __be32 xirr, bool *again) { void __iomem *xics_phys; int64_t rc; - rc = pnv_opal_pci_msi_eoi(c, hwirq); + rc = pnv_opal_pci_msi_eoi(d); if (rc) eoi_rc = rc; @@ -872,8 +873,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, icp_rm_deliver_irq(xics, icp, irq, false); /* EOI the interrupt */ - icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, - again); + icp_eoi(irq_desc_get_irq_data(irq_map->desc), irq_map->r_hwirq, xirr, again); if (check_too_hard(xics, icp) == H_TOO_HARD) return 2; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 8dd437d7a2c6..90484425a1e6 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1088,12 +1088,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE beq kvmppc_hisi -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* For softpatch interrupt, go off and do TM instruction emulation */ - cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH - beq kvmppc_tm_emul -#endif - /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -1599,42 +1593,6 @@ maybe_reenter_guest: blt deliver_guest_interrupt b guest_exit_cont -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -/* - * Softpatch interrupt for transactional memory emulation cases - * on POWER9 DD2.2. This is early in the guest exit path - we - * haven't saved registers or done a treclaim yet. - */ -kvmppc_tm_emul: - /* Save instruction image in HEIR */ - mfspr r3, SPRN_HEIR - stw r3, VCPU_HEIR(r9) - - /* - * The cases we want to handle here are those where the guest - * is in real suspend mode and is trying to transition to - * transactional mode. 
- */ - lbz r0, HSTATE_FAKE_SUSPEND(r13) - cmpwi r0, 0 /* keep exiting guest if in fake suspend */ - bne guest_exit_cont - rldicl r3, r11, 64 - MSR_TS_S_LG, 62 - cmpwi r3, 1 /* or if not in suspend state */ - bne guest_exit_cont - - /* Call C code to do the emulation */ - mr r3, r9 - bl kvmhv_p9_tm_emulation_early - nop - ld r9, HSTATE_KVM_VCPU(r13) - li r12, BOOK3S_INTERRUPT_HV_SOFTPATCH - cmpwi r3, 0 - beq guest_exit_cont /* continue exiting if not handled */ - ld r10, VCPU_PC(r9) - ld r11, VCPU_MSR(r9) - b fast_interrupt_c_return /* go back to guest if handled */ -#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ - /* * Check whether an HDSI is an HPTE not found fault or something else. * If it is an HPTE not found fault that is due to the guest accessing @@ -2578,7 +2536,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_ASSIST) /* The following code handles the fake_suspend = 1 case */ mflr r0 std r0, PPC_LR_STKOFF(r1) - stdu r1, -PPC_MIN_STKFRM(r1) + stdu r1, -TM_FRAME_SIZE(r1) /* Turn on TM. */ mfmsr r8 @@ -2593,10 +2551,42 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) nop + /* + * It's possible that treclaim. may modify registers, if we have lost + * track of fake-suspend state in the guest due to it using rfscv. + * Save and restore registers in case this occurs. + */ + mfspr r3, SPRN_DSCR + mfspr r4, SPRN_XER + mfspr r5, SPRN_AMR + /* SPRN_TAR would need to be saved here if the kernel ever used it */ + mfcr r12 + SAVE_NVGPRS(r1) + SAVE_GPR(2, r1) + SAVE_GPR(3, r1) + SAVE_GPR(4, r1) + SAVE_GPR(5, r1) + stw r12, 8(r1) + std r1, HSTATE_HOST_R1(r13) + /* We have to treclaim here because that's the only way to do S->N */ li r3, TM_CAUSE_KVM_RESCHED TRECLAIM(R3) + GET_PACA(r13) + ld r1, HSTATE_HOST_R1(r13) + REST_GPR(2, r1) + REST_GPR(3, r1) + REST_GPR(4, r1) + REST_GPR(5, r1) + lwz r12, 8(r1) + REST_NVGPRS(r1) + mtspr SPRN_DSCR, r3 + mtspr SPRN_XER, r4 + mtspr SPRN_AMR, r5 + mtcr r12 + HMT_MEDIUM + /* * We were in fake suspend, so we are not going to save the * register state as the guest checkpointed state (since @@ -2624,7 +2614,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) std r5, VCPU_TFHAR(r9) std r6, VCPU_TFIAR(r9) - addi r1, r1, PPC_MIN_STKFRM + addi r1, r1, TM_FRAME_SIZE ld r0, PPC_LR_STKOFF(r1) mtlr r0 blr diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c index cc90b8b82329..866cadd70094 100644 --- a/arch/powerpc/kvm/book3s_hv_tm.c +++ b/arch/powerpc/kvm/book3s_hv_tm.c @@ -47,6 +47,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) int ra, rs; /* + * The TM softpatch interrupt sets NIP to the instruction following + * the faulting instruction, which is not executed. Rewind nip to the + * faulting instruction so it looks like a normal synchronous + * interrupt, then update nip in the places where the instruction is + * emulated. + */ + vcpu->arch.regs.nip -= 4; + + /* * rfid, rfebb, and mtmsrd encode bit 31 = 0 since it's a reserved bit * in these instructions, so masking bit 31 out doesn't change these * instructions. For treclaim., tsr., and trechkpt. 
instructions if bit @@ -67,7 +76,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) (newmsr & MSR_TM))); newmsr = sanitize_msr(newmsr); vcpu->arch.shregs.msr = newmsr; - vcpu->arch.cfar = vcpu->arch.regs.nip - 4; + vcpu->arch.cfar = vcpu->arch.regs.nip; vcpu->arch.regs.nip = vcpu->arch.shregs.srr0; return RESUME_GUEST; @@ -79,14 +88,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) } /* check EBB facility is available */ if (!(vcpu->arch.hfscr & HFSCR_EBB)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_EBB_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if ((msr & MSR_PR) && !(vcpu->arch.fscr & FSCR_EBB)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_EBB_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_EBB_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; } @@ -100,7 +110,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.bescr = bescr; msr = (msr & ~MSR_TS_MASK) | MSR_TS_T; vcpu->arch.shregs.msr = msr; - vcpu->arch.cfar = vcpu->arch.regs.nip - 4; + vcpu->arch.cfar = vcpu->arch.regs.nip; vcpu->arch.regs.nip = vcpu->arch.ebbrr; return RESUME_GUEST; @@ -116,6 +126,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) newmsr = (newmsr & ~MSR_LE) | (msr & MSR_LE); newmsr = sanitize_msr(newmsr); vcpu->arch.shregs.msr = newmsr; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; /* ignore bit 31, see comment above */ @@ -128,14 +139,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) } /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if (!(msr & MSR_TM)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_TM_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; @@ -152,20 +164,22 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) msr = (msr & ~MSR_TS_MASK) | MSR_TS_S; } vcpu->arch.shregs.msr = msr; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; /* ignore bit 31, see comment above */ case (PPC_INST_TRECLAIM & PO_XOP_OPCODE_MASK): /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if (!(msr & MSR_TM)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_TM_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; @@ -189,6 +203,7 @@ int 
kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29); vcpu->arch.shregs.msr &= ~MSR_TS_MASK; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; /* ignore bit 31, see comment above */ @@ -196,14 +211,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) /* XXX do we need to check for PR=0 here? */ /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if (!(msr & MSR_TM)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_TM_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; @@ -220,6 +236,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29); vcpu->arch.shregs.msr = msr | MSR_TS_S; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; } diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 303e3cb096db..ebd5d920de8c 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -10,13 +10,13 @@ #include <linux/gfp.h> #include <linux/anon_inodes.h> #include <linux/spinlock.h> - +#include <linux/debugfs.h> #include <linux/uaccess.h> + #include <asm/kvm_book3s.h> #include <asm/kvm_ppc.h> #include <asm/hvcall.h> #include <asm/xics.h> -#include <asm/debugfs.h> #include <asm/time.h> #include <linux/seq_file.h> @@ -1024,7 +1024,7 @@ static void xics_debugfs_init(struct kvmppc_xics *xics) return; } - xics->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root, + xics->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, xics, &xics_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 8cfab3547494..a18db9e16ea4 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -22,7 +22,6 @@ #include <asm/xive.h> #include <asm/xive-regs.h> #include <asm/debug.h> -#include <asm/debugfs.h> #include <asm/time.h> #include <asm/opal.h> @@ -59,6 +58,25 @@ */ #define XIVE_Q_GAP 2 +static bool kvmppc_xive_vcpu_has_save_restore(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + /* Check enablement at VP level */ + return xc->vp_cam & TM_QW1W2_HO; +} + +bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = xc->xive; + + if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE) + return kvmppc_xive_vcpu_has_save_restore(vcpu); + + return true; +} + /* * Push a vcpu's context to the XIVE on guest entry. 
* This assumes we are in virtual mode (MMU on) @@ -77,7 +95,8 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) return; eieio(); - __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); + if (!kvmppc_xive_vcpu_has_save_restore(vcpu)) + __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); vcpu->arch.xive_pushed = 1; eieio(); @@ -149,7 +168,8 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) /* First load to pull the context, we ignore the value */ __raw_readl(tima + TM_SPC_PULL_OS_CTX); /* Second load to recover the context state (Words 0 and 1) */ - vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS); + if (!kvmppc_xive_vcpu_has_save_restore(vcpu)) + vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS); /* Fixup some of the state for the next load */ vcpu->arch.xive_saved_state.lsmfb = 0; @@ -363,9 +383,9 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) if (!vcpu->arch.xive_vcpu) continue; rc = xive_provision_queue(vcpu, prio); - if (rc == 0 && !xive->single_escalation) + if (rc == 0 && !kvmppc_xive_has_single_escalation(xive)) kvmppc_xive_attach_escalation(vcpu, prio, - xive->single_escalation); + kvmppc_xive_has_single_escalation(xive)); if (rc) return rc; } @@ -922,13 +942,13 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) } int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc) + unsigned long host_irq) { struct kvmppc_xive *xive = kvm->arch.xive; struct kvmppc_xive_src_block *sb; struct kvmppc_xive_irq_state *state; - struct irq_data *host_data = irq_desc_get_irq_data(host_desc); - unsigned int host_irq = irq_desc_get_irq(host_desc); + struct irq_data *host_data = + irq_domain_get_irq_data(irq_get_default_host(), host_irq); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); u16 idx; u8 prio; @@ -937,7 +957,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, if (!xive) return -ENODEV; - pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n",guest_irq, hw_irq); + pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n", + __func__, guest_irq, host_irq, hw_irq); sb = kvmppc_xive_find_source(xive, guest_irq, &idx); if (!sb) @@ -959,7 +980,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, */ rc = irq_set_vcpu_affinity(host_irq, state); if (rc) { - pr_err("Failed to set VCPU affinity for irq %d\n", host_irq); + pr_err("Failed to set VCPU affinity for host IRQ %ld\n", host_irq); return rc; } @@ -1019,12 +1040,11 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped); int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc) + unsigned long host_irq) { struct kvmppc_xive *xive = kvm->arch.xive; struct kvmppc_xive_src_block *sb; struct kvmppc_xive_irq_state *state; - unsigned int host_irq = irq_desc_get_irq(host_desc); u16 idx; u8 prio; int rc; @@ -1032,7 +1052,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, if (!xive) return -ENODEV; - pr_devel("clr_mapped girq 0x%lx...\n", guest_irq); + pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n", __func__, guest_irq, host_irq); sb = kvmppc_xive_find_source(xive, guest_irq, &idx); if (!sb) @@ -1059,7 +1079,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, /* Release the passed-through interrupt to the host */ rc = irq_set_vcpu_affinity(host_irq, NULL); if (rc) { - 
pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq); + pr_err("Failed to clr VCPU affinity for host IRQ %ld\n", host_irq); return rc; } @@ -1199,7 +1219,7 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { if (xc->esc_virq[i]) { - if (xc->xive->single_escalation) + if (kvmppc_xive_has_single_escalation(xc->xive)) xive_cleanup_single_escalation(vcpu, xc, xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); @@ -1319,6 +1339,12 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (r) goto bail; + if (!kvmppc_xive_check_save_restore(vcpu)) { + pr_err("inconsistent save-restore setup for VCPU %d\n", cpu); + r = -EIO; + goto bail; + } + /* Configure VCPU fields for use by assembly push/pull */ vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000); vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO); @@ -1340,7 +1366,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, * Enable the VP first as the single escalation mode will * affect escalation interrupts numbering */ - r = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + r = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive)); if (r) { pr_err("Failed to enable VP in OPAL, err %d\n", r); goto bail; @@ -1357,15 +1383,15 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, struct xive_q *q = &xc->queues[i]; /* Single escalation, no queue 7 */ - if (i == 7 && xive->single_escalation) + if (i == 7 && kvmppc_xive_has_single_escalation(xive)) break; /* Is queue already enabled ? Provision it */ if (xive->qmap & (1 << i)) { r = xive_provision_queue(vcpu, i); - if (r == 0 && !xive->single_escalation) + if (r == 0 && !kvmppc_xive_has_single_escalation(xive)) kvmppc_xive_attach_escalation( - vcpu, i, xive->single_escalation); + vcpu, i, kvmppc_xive_has_single_escalation(xive)); if (r) goto bail; } else { @@ -1380,7 +1406,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, } /* If not done above, attach priority 0 escalation */ - r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation); + r = kvmppc_xive_attach_escalation(vcpu, 0, kvmppc_xive_has_single_escalation(xive)); if (r) goto bail; @@ -2135,7 +2161,11 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) */ xive->nr_servers = KVM_MAX_VCPUS; - xive->single_escalation = xive_native_has_single_escalation(); + if (xive_native_has_single_escalation()) + xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; + + if (xive_native_has_save_restore()) + xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE; kvm->arch.xive = xive; return 0; @@ -2329,7 +2359,7 @@ static void xive_debugfs_init(struct kvmppc_xive *xive) return; } - xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xive->dentry = debugfs_create_file(name, S_IRUGO, arch_debugfs_dir, xive, &xive_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index afe9eeac6d56..e6a9651c6f1e 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -97,6 +97,9 @@ struct kvmppc_xive_ops { int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq); }; +#define KVMPPC_XIVE_FLAG_SINGLE_ESCALATION 0x1 +#define KVMPPC_XIVE_FLAG_SAVE_RESTORE 0x2 + struct kvmppc_xive { struct kvm *kvm; struct kvm_device *dev; @@ -133,7 +136,7 @@ struct kvmppc_xive { u32 q_page_order; /* Flags */ - u8 single_escalation; + u8 flags; /* Number of entries in the VP block */ u32 nr_servers; @@ 
-307,6 +310,12 @@ void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, struct kvmppc_xive_vcpu *xc, int irq); int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr); +bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu); + +static inline bool kvmppc_xive_has_single_escalation(struct kvmppc_xive *xive) +{ + return xive->flags & KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; +} #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 573ecaab3597..99db9ac49901 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -20,7 +20,6 @@ #include <asm/xive.h> #include <asm/xive-regs.h> #include <asm/debug.h> -#include <asm/debugfs.h> #include <asm/opal.h> #include <linux/debugfs.h> @@ -93,7 +92,7 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { /* Free the escalation irq */ if (xc->esc_virq[i]) { - if (xc->xive->single_escalation) + if (kvmppc_xive_has_single_escalation(xc->xive)) xive_cleanup_single_escalation(vcpu, xc, xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); @@ -168,11 +167,17 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, goto bail; } + if (!kvmppc_xive_check_save_restore(vcpu)) { + pr_err("inconsistent save-restore setup for VCPU %d\n", server_num); + rc = -EIO; + goto bail; + } + /* * Enable the VP first as the single escalation mode will * affect escalation interrupts numbering */ - rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + rc = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive)); if (rc) { pr_err("Failed to enable VP in OPAL: %d\n", rc); goto bail; @@ -693,7 +698,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, } rc = kvmppc_xive_attach_escalation(vcpu, priority, - xive->single_escalation); + kvmppc_xive_has_single_escalation(xive)); error: if (rc) kvmppc_xive_native_cleanup_queue(vcpu, priority); @@ -820,7 +825,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive) for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) { /* Single escalation, no queue 7 */ - if (prio == 7 && xive->single_escalation) + if (prio == 7 && kvmppc_xive_has_single_escalation(xive)) break; if (xc->esc_virq[prio]) { @@ -1111,7 +1116,12 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) */ xive->nr_servers = KVM_MAX_VCPUS; - xive->single_escalation = xive_native_has_single_escalation(); + if (xive_native_has_single_escalation()) + xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; + + if (xive_native_has_save_restore()) + xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE; + xive->ops = &kvmppc_xive_native_ops; kvm->arch.xive = xive; @@ -1257,7 +1267,7 @@ static void xive_native_debugfs_init(struct kvmppc_xive *xive) return; } - xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root, + xive->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, xive, &xive_native_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 551b30d84aee..977801c83aff 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct 
kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -69,7 +67,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, emulated_inst_exits), STATS_DESC_COUNTER(VCPU, dec_exits), STATS_DESC_COUNTER(VCPU, ext_intr_exits), - STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), STATS_DESC_COUNTER(VCPU, halt_successful_wait), STATS_DESC_COUNTER(VCPU, dbell_exits), STATS_DESC_COUNTER(VCPU, gdbell_exits), @@ -79,8 +76,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index eae4ec2988fc..df8172da2301 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -18,5 +18,5 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_PPC_PTDUMP) += ptdump/ +obj-$(CONFIG_PTDUMP_CORE) += ptdump/ obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 52e170bd95ae..d8279bfe68ea 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -787,7 +787,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, * TODO: add batching support when enabled. remember, no dynamic memory here, * although there is the control page available... */ -static void native_hpte_clear(void) +static notrace void native_hpte_clear(void) { unsigned long vpn = 0; unsigned long slot, slots; diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index ac5720371c0d..c145776d3ae5 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -36,8 +36,8 @@ #include <linux/hugetlb.h> #include <linux/cpu.h> #include <linux/pgtable.h> +#include <linux/debugfs.h> -#include <asm/debugfs.h> #include <asm/interrupt.h> #include <asm/processor.h> #include <asm/mmu.h> @@ -2072,7 +2072,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n") static int __init hash64_debugfs(void) { - debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, NULL, + debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL, &fops_hpt_order); return 0; } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 9ffa65074cb0..9e16c7b1a6c5 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -6,9 +6,9 @@ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/memblock.h> +#include <linux/debugfs.h> #include <misc/cxl-base.h> -#include <asm/debugfs.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/trace.h> @@ -172,8 +172,8 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* For use by kexec */ -void mmu_cleanup_all(void) +/* For use by kexec, called with MMU off */ +notrace void mmu_cleanup_all(void) { if (radix_enabled()) radix__mmu_cleanup_all(); @@ -520,7 +520,7 @@ static int __init pgtable_debugfs_setup(void) * invalidated as expected. 
*/ debugfs_create_bool("tlbie_enabled", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &tlbie_enabled); return 0; diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e50ddf129c15..ae20add7954a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -679,7 +679,8 @@ void radix__early_init_mmu_secondary(void) mtspr(SPRN_UAMOR, 0); } -void radix__mmu_cleanup_all(void) +/* Called during kexec sequence with MMU off */ +notrace void radix__mmu_cleanup_all(void) { unsigned long lpcr; diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index aefc100d79a7..7724af19ed7e 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -10,6 +10,7 @@ #include <linux/memblock.h> #include <linux/mmu_context.h> #include <linux/sched/mm.h> +#include <linux/debugfs.h> #include <asm/ppc-opcode.h> #include <asm/tlb.h> @@ -1106,8 +1107,8 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); * invalidating a full PID, so it has a far lower threshold to change from * individual page flushes to full-pid flushes. */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; +static u32 tlb_single_page_flush_ceiling __read_mostly = 33; +static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end) @@ -1524,3 +1525,14 @@ void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid, EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_single_page_flush_ceiling); + debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_local_single_page_flush_ceiling); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); + diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index c91bd85eb90e..f0037bcc47a0 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -822,7 +822,7 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault) /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); - if (unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) return -EINVAL; /* diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 9af3832c9d8d..22197b18d85e 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -18,6 +18,7 @@ static int n_root_addr_cells, n_root_size_cells; static struct drmem_lmb_info __drmem_info; struct drmem_lmb_info *drmem_info = &__drmem_info; +static bool in_drmem_update; u64 drmem_lmb_memory_max(void) { @@ -178,6 +179,11 @@ int drmem_update_dt(void) if (!memory) return -1; + /* + * Set in_drmem_update to prevent the notifier callback to process the + * DT property back since the change is coming from the LMB tree. 
+ */ + in_drmem_update = true; prop = of_find_property(memory, "ibm,dynamic-memory", NULL); if (prop) { rc = drmem_update_dt_v1(memory, prop); @@ -186,6 +192,7 @@ int drmem_update_dt(void) if (prop) rc = drmem_update_dt_v2(memory, prop); } + in_drmem_update = false; of_node_put(memory); return rc; @@ -307,6 +314,45 @@ int __init walk_drmem_lmbs_early(unsigned long node, void *data, return ret; } +/* + * Update the LMB associativity index. + */ +static int update_lmb(struct drmem_lmb *updated_lmb, + __maybe_unused const __be32 **usm, + __maybe_unused void *data) +{ + struct drmem_lmb *lmb; + + for_each_drmem_lmb(lmb) { + if (lmb->drc_index != updated_lmb->drc_index) + continue; + + lmb->aa_index = updated_lmb->aa_index; + break; + } + return 0; +} + +/* + * Update the LMB associativity index. + * + * This needs to be called when the hypervisor is updating the + * dynamic-reconfiguration-memory node property. + */ +void drmem_update_lmbs(struct property *prop) +{ + /* + * Don't update the LMBs if triggered by the update done in + * drmem_update_dt(), the LMB values have been used to the update the DT + * property in that case. + */ + if (in_drmem_update) + return; + if (!strcmp(prop->name, "ibm,dynamic-memory")) + __walk_drmem_v1_lmbs(prop->value, NULL, NULL, update_lmb); + else if (!strcmp(prop->name, "ibm,dynamic-memory-v2")) + __walk_drmem_v2_lmbs(prop->value, NULL, NULL, update_lmb); +} #endif static int init_drmem_lmb_size(struct device_node *dn) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ad198b439222..c3c4e31462ec 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -119,8 +119,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return rc; } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 7dac910c0b21..dd1cabc2ea0f 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -180,7 +180,7 @@ static inline void mmu_mark_rodata_ro(void) { } void __init mmu_mapin_immr(void); #endif -#ifdef CONFIG_PPC_DEBUG_WX +#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void); #else static inline void ptdump_check_wx(void) { } diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index 4613bf8e9aae..5add4a51e51f 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) * Touch enough instruction cache lines to ensure cache hits */ 1: mflr r9 - bl 2f + bcl 20,31,$+4 2: mflr r6 li r7,32 PPC_ICBT(0,R6,R7) /* touch next cache line */ @@ -414,7 +414,7 @@ _GLOBAL(loadcam_multi) * Set up temporary TLB entry that is the same as what we're * running from, but in AS=1. */ - bl 1f + bcl 20,31,$+4 1: mflr r6 tlbsx 0,r8 mfspr r6,SPRN_MAS1 diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f2bf98bdcea2..6f14c8fb6359 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -40,9 +40,6 @@ static int numa_enabled = 1; static char *cmdline __initdata; -static int numa_debug; -#define dbg(args...) 
if (numa_debug) { printk(KERN_INFO args); } - int numa_cpu_lookup_table[NR_CPUS]; cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; struct pglist_data *node_data[MAX_NUMNODES]; @@ -51,14 +48,22 @@ EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(node_to_cpumask_map); EXPORT_SYMBOL(node_data); -static int min_common_depth; +static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; -static int form1_affinity; + +#define FORM0_AFFINITY 0 +#define FORM1_AFFINITY 1 +#define FORM2_AFFINITY 2 +static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; static const __be32 *distance_ref_points; static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; +static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { + [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } +}; +static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE }; /* * Allocate node_to_cpumask_map based on number of available nodes @@ -79,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - dbg("Node to cpumask map for %u nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init fake_numa_create_new_node(unsigned long end_pfn, @@ -123,7 +128,7 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn, cmdline = p; fake_nid++; *nid = fake_nid; - dbg("created new fake_node with id %d\n", fake_nid); + pr_debug("created new fake_node with id %d\n", fake_nid); return 1; } return 0; @@ -137,33 +142,79 @@ static void reset_numa_cpu_lookup_table(void) numa_cpu_lookup_table[cpu] = -1; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); - dbg("adding cpu %d to node %d\n", cpu, node); - - if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) + if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) { + pr_debug("adding cpu %d to node %d\n", cpu, node); cpumask_set_cpu(cpu, node_to_cpumask_map[node]); + } } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; - dbg("removing cpu %lu from node %d\n", cpu, node); - if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); + pr_debug("removing cpu %lu from node %d\n", cpu, node); } else { - printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", - cpu, node); + pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); } } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) +{ + int nid; + /* + * primary_domain_index is 1 based array index. + */ + int index = primary_domain_index - 1; + + if (!numa_enabled || index >= max_array_sz) + return NUMA_NO_NODE; + + nid = of_read_number(&associativity[index], 1); + + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) + nid = NUMA_NO_NODE; + return nid; +} +/* + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA + * info is found. 
+ */ +static int associativity_to_nid(const __be32 *associativity) +{ + int array_sz = of_read_number(associativity, 1); + + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); +} + +static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + int dist; + int node1, node2; + + node1 = associativity_to_nid(cpu1_assoc); + node2 = associativity_to_nid(cpu2_assoc); + + dist = numa_distance_table[node1][node2]; + if (dist <= LOCAL_DISTANCE) + return 0; + else if (dist <= REMOTE_DISTANCE) + return 1; + else + return 2; +} + +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { int dist = 0; @@ -179,6 +230,15 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) return dist; } +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + /* We should not get called with FORM0 */ + VM_WARN_ON(affinity_form == FORM0_AFFINITY); + if (affinity_form == FORM1_AFFINITY) + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); + return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); +} + /* must hold reference to node during call */ static const __be32 *of_get_associativity(struct device_node *dev) { @@ -190,7 +250,9 @@ int __node_distance(int a, int b) int i; int distance = LOCAL_DISTANCE; - if (!form1_affinity) + if (affinity_form == FORM2_AFFINITY) + return numa_distance_table[a][b]; + else if (affinity_form == FORM0_AFFINITY) return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); for (i = 0; i < distance_ref_points_depth; i++) { @@ -205,52 +267,6 @@ int __node_distance(int a, int b) } EXPORT_SYMBOL(__node_distance); -static void initialize_distance_lookup_table(int nid, - const __be32 *associativity) -{ - int i; - - if (!form1_affinity) - return; - - for (i = 0; i < distance_ref_points_depth; i++) { - const __be32 *entry; - - entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1]; - distance_lookup_table[nid][i] = of_read_number(entry, 1); - } -} - -/* - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA - * info is found. - */ -static int associativity_to_nid(const __be32 *associativity) -{ - int nid = NUMA_NO_NODE; - - if (!numa_enabled) - goto out; - - if (of_read_number(associativity, 1) >= min_common_depth) - nid = of_read_number(&associativity[min_common_depth], 1); - - /* POWER4 LPAR uses 0xffff as invalid node */ - if (nid == 0xffff || nid >= nr_node_ids) - nid = NUMA_NO_NODE; - - if (nid > 0 && - of_read_number(associativity, 1) >= distance_ref_points_depth) { - /* - * Skip the length field and send start of associativity array - */ - initialize_distance_lookup_table(nid, associativity + 1); - } - -out: - return nid; -} - /* Returns the nid associated with the given device tree node, * or -1 if not found. 
*/ @@ -284,11 +300,159 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); -static int __init find_min_common_depth(void) +static void __initialize_form1_numa_distance(const __be32 *associativity, + int max_array_sz) +{ + int i, nid; + + if (affinity_form != FORM1_AFFINITY) + return; + + nid = __associativity_to_nid(associativity, max_array_sz); + if (nid != NUMA_NO_NODE) { + for (i = 0; i < distance_ref_points_depth; i++) { + const __be32 *entry; + int index = be32_to_cpu(distance_ref_points[i]) - 1; + + /* + * broken hierarchy, return with broken distance table + */ + if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) + return; + + entry = &associativity[index]; + distance_lookup_table[nid][i] = of_read_number(entry, 1); + } + } +} + +static void initialize_form1_numa_distance(const __be32 *associativity) +{ + int array_sz; + + array_sz = of_read_number(associativity, 1); + /* Skip the first element in the associativity array */ + __initialize_form1_numa_distance(associativity + 1, array_sz); +} + +/* + * Used to update distance information w.r.t newly added node. + */ +void update_numa_distance(struct device_node *node) { - int depth; + int nid; + + if (affinity_form == FORM0_AFFINITY) + return; + else if (affinity_form == FORM1_AFFINITY) { + const __be32 *associativity; + + associativity = of_get_associativity(node); + if (!associativity) + return; + + initialize_form1_numa_distance(associativity); + return; + } + + /* FORM2 affinity */ + nid = of_node_to_nid_single(node); + if (nid == NUMA_NO_NODE) + return; + + /* + * With FORM2 we expect NUMA distance of all possible NUMA + * nodes to be provided during boot. + */ + WARN(numa_distance_table[nid][nid] == -1, + "NUMA distance details for node %d not provided\n", nid); +} + +/* + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} + */ +static void initialize_form2_numa_distance_lookup_table(void) +{ + int i, j; + struct device_node *root; + const __u8 *numa_dist_table; + const __be32 *numa_lookup_index; + int numa_dist_table_length; + int max_numa_index, distance_index; + + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else + root = of_find_node_by_path("/rtas"); + if (!root) + root = of_find_node_by_path("/"); + + numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); + max_numa_index = of_read_number(&numa_lookup_index[0], 1); + + /* first element of the array is the size and is encode-int */ + numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL); + numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1); + /* Skip the size which is encoded int */ + numa_dist_table += sizeof(__be32); + + pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n", + numa_dist_table_length, max_numa_index); + + for (i = 0; i < max_numa_index; i++) + /* +1 skip the max_numa_index in the property */ + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); + + + if (numa_dist_table_length != max_numa_index * max_numa_index) { + WARN(1, "Wrong NUMA distance information\n"); + /* consider everybody else just remote. 
*/ + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + if (nodeA == nodeB) + numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE; + else + numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE; + } + } + } + + distance_index = 0; + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++]; + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]); + } + } + of_node_put(root); +} + +static int __init find_primary_domain_index(void) +{ + int index; struct device_node *root; + /* + * Check for which form of affinity. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) { + affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { + pr_debug("Using form 2 affinity\n"); + affinity_form = FORM2_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { + pr_debug("Using form 1 affinity\n"); + affinity_form = FORM1_AFFINITY; + } else + affinity_form = FORM0_AFFINITY; + if (firmware_has_feature(FW_FEATURE_OPAL)) root = of_find_node_by_path("/ibm,opal"); else @@ -313,42 +477,37 @@ static int __init find_min_common_depth(void) &distance_ref_points_depth); if (!distance_ref_points) { - dbg("NUMA: ibm,associativity-reference-points not found.\n"); + pr_debug("ibm,associativity-reference-points not found.\n"); goto err; } distance_ref_points_depth /= sizeof(int); - - if (firmware_has_feature(FW_FEATURE_OPAL) || - firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { - dbg("Using form 1 affinity\n"); - form1_affinity = 1; - } - - if (form1_affinity) { - depth = of_read_number(distance_ref_points, 1); - } else { + if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { - printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + pr_warn("short ibm,associativity-reference-points\n"); goto err; } - depth = of_read_number(&distance_ref_points[1], 1); + index = of_read_number(&distance_ref_points[1], 1); + } else { + /* + * Both FORM1 and FORM2 affinity find the primary domain details + * at the same offset. + */ + index = of_read_number(distance_ref_points, 1); } - /* * Warn and cap if the hardware supports more than * MAX_DISTANCE_REF_POINTS domains. 
*/ if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { - printk(KERN_WARNING "NUMA: distance array capped at " - "%d entries\n", MAX_DISTANCE_REF_POINTS); + pr_warn("distance array capped at %d entries\n", + MAX_DISTANCE_REF_POINTS); distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; } of_node_put(root); - return depth; + return index; err: of_node_put(root); @@ -426,6 +585,38 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) return 0; } +static int get_nid_and_numa_distance(struct drmem_lmb *lmb) +{ + struct assoc_arrays aa = { .arrays = NULL }; + int default_nid = NUMA_NO_NODE; + int nid = default_nid; + int rc, index; + + if ((primary_domain_index < 0) || !numa_enabled) + return default_nid; + + rc = of_get_assoc_arrays(&aa); + if (rc) + return default_nid; + + if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { + const __be32 *associativity; + + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); + if (nid > 0 && affinity_form == FORM1_AFFINITY) { + /* + * lookup array associativity entries have + * no length of the array as the first element. + */ + __initialize_form1_numa_distance(associativity, aa.array_sz); + } + } + return nid; +} + /* * This is like of_node_to_nid_single() for memory represented in the * ibm,dynamic-reconfiguration-memory node. @@ -437,35 +628,28 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) int nid = default_nid; int rc, index; - if ((min_common_depth < 0) || !numa_enabled) + if ((primary_domain_index < 0) || !numa_enabled) return default_nid; rc = of_get_assoc_arrays(&aa); if (rc) return default_nid; - if (min_common_depth <= aa.array_sz && + if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + min_common_depth - 1; - nid = of_read_number(&aa.arrays[index], 1); - - if (nid == 0xffff || nid >= nr_node_ids) - nid = default_nid; + const __be32 *associativity; - if (nid > 0) { - index = lmb->aa_index * aa.array_sz; - initialize_distance_lookup_table(nid, - &aa.arrays[index]); - } + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); } - return nid; } #ifdef CONFIG_PPC_SPLPAR -static int vphn_get_nid(long lcpu) + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) { - __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; long rc, hwid; /* @@ -485,12 +669,30 @@ static int vphn_get_nid(long lcpu) rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); if (rc == H_SUCCESS) - return associativity_to_nid(associativity); + return 0; } + return -1; +} + +static int vphn_get_nid(long lcpu) +{ + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + + + if (!__vphn_get_associativity(lcpu, associativity)) + return associativity_to_nid(associativity); + return NUMA_NO_NODE; + } #else + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) +{ + return -1; +} + static int vphn_get_nid(long unused) { return NUMA_NO_NODE; @@ -598,9 +800,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu) static int ppc_numa_cpu_dead(unsigned int cpu) { -#ifdef CONFIG_HOTPLUG_CPU - unmap_cpu_from_node(cpu); -#endif return 0; } @@ -685,7 +884,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, size = read_n_cells(n_mem_size_cells, usm); } - nid = of_drconf_to_nid_single(lmb); + nid = 
get_nid_and_numa_distance(lmb); fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), &nid); node_set_online(nid); @@ -702,24 +901,31 @@ static int __init parse_numa_properties(void) struct device_node *memory; int default_nid = 0; unsigned long i; + const __be32 *associativity; if (numa_enabled == 0) { - printk(KERN_WARNING "NUMA disabled by user\n"); + pr_warn("disabled by user\n"); return -1; } - min_common_depth = find_min_common_depth(); + primary_domain_index = find_primary_domain_index(); - if (min_common_depth < 0) { + if (primary_domain_index < 0) { /* - * if we fail to parse min_common_depth from device tree + * if we fail to parse primary_domain_index from device tree * mark the numa disabled, boot with numa disabled. */ numa_enabled = false; - return min_common_depth; + return primary_domain_index; } - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index); + + /* + * If it is FORM2 initialize the distance table here. + */ + if (affinity_form == FORM2_AFFINITY) + initialize_form2_numa_distance_lookup_table(); /* * Even though we connect cpus to numa domains later in SMP @@ -727,18 +933,30 @@ static int __init parse_numa_properties(void) * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; struct device_node *cpu; - int nid = vphn_get_nid(i); + int nid = NUMA_NO_NODE; - /* - * Don't fall back to default_nid yet -- we will plug - * cpus into nodes once the memory scan has discovered - * the topology. - */ - if (nid == NUMA_NO_NODE) { + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); + + if (__vphn_get_associativity(i, vphn_assoc) == 0) { + nid = associativity_to_nid(vphn_assoc); + initialize_form1_numa_distance(vphn_assoc); + } else { + + /* + * Don't fall back to default_nid yet -- we will plug + * cpus into nodes once the memory scan has discovered + * the topology. + */ cpu = of_get_cpu_node(i, NULL); BUG_ON(!cpu); - nid = of_node_to_nid_single(cpu); + + associativity = of_get_associativity(cpu); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } of_node_put(cpu); } @@ -774,8 +992,11 @@ new_range: * have associativity properties. If none, then * everything goes to default_nid. 
*/ - nid = of_node_to_nid_single(memory); - if (nid < 0) + associativity = of_get_associativity(memory); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } else nid = default_nid; fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); @@ -811,10 +1032,8 @@ static void __init setup_nonnuma(void) unsigned int nid = 0; int i; - printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", - top_of_ram, total_ram); - printk(KERN_DEBUG "Memory hole size: %ldMB\n", - (top_of_ram - total_ram) >> 20); + pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); + pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { fake_numa_create_new_node(end_pfn, &nid); @@ -893,7 +1112,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) static void __init find_possible_nodes(void) { struct device_node *rtas; - const __be32 *domains; + const __be32 *domains = NULL; int prop_length, max_nodes; u32 i; @@ -909,9 +1128,14 @@ static void __init find_possible_nodes(void) * it doesn't exist, then fallback on ibm,max-associativity-domains. * Current denotes what the platform can support compared to max * which denotes what the Hypervisor can support. + * + * If the LPAR is migratable, new nodes might be activated after a LPM, + * so we should consider the max number in that case. */ - domains = of_get_property(rtas, "ibm,current-associativity-domains", - &prop_length); + if (!of_get_property(of_root, "ibm,migratable-partition", NULL)) + domains = of_get_property(rtas, + "ibm,current-associativity-domains", + &prop_length); if (!domains) { domains = of_get_property(rtas, "ibm,max-associativity-domains", &prop_length); @@ -919,14 +1143,16 @@ static void __init find_possible_nodes(void) goto out; } - max_nodes = of_read_number(&domains[min_common_depth], 1); + max_nodes = of_read_number(&domains[primary_domain_index], 1); + pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); + for (i = 0; i < max_nodes; i++) { if (!node_possible(i)) node_set(i, node_possible_map); } prop_length /= sizeof(int); - if (prop_length > min_common_depth + 2) + if (prop_length > primary_domain_index + 2) coregroup_enabled = 1; out: @@ -1014,9 +1240,6 @@ static int __init early_numa(char *p) if (strstr(p, "off")) numa_enabled = 0; - if (strstr(p, "debug")) - numa_debug = 1; - p = strstr(p, "fake="); if (p) cmdline = p + strlen("fake="); @@ -1179,7 +1402,7 @@ static long vphn_get_associativity(unsigned long cpu, switch (rc) { case H_SUCCESS: - dbg("VPHN hcall succeeded. Reset polling...\n"); + pr_debug("VPHN hcall succeeded. 
Reset polling...\n"); goto out; case H_FUNCTION: @@ -1259,7 +1482,7 @@ int cpu_to_coregroup_id(int cpu) goto out; index = of_read_number(associativity, 1); - if (index > min_common_depth + 1) + if (index > primary_domain_index + 1) return of_read_number(&associativity[index - 1], 1); out: diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index 86da2a669680..fac932eb8f9a 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -75,8 +75,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile index 712762be3cb1..4050cbb55acf 100644 --- a/arch/powerpc/mm/ptdump/Makefile +++ b/arch/powerpc/mm/ptdump/Makefile @@ -5,5 +5,10 @@ obj-y += ptdump.o obj-$(CONFIG_4xx) += shared.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o -obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o -obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o +obj-$(CONFIG_PPC_BOOK3S_32) += shared.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o + +ifdef CONFIG_PTDUMP_DEBUGFS +obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o +obj-$(CONFIG_PPC_BOOK3S_64) += hashpagetable.o +endif diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index c4c628b03cf8..820c119013e4 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -7,7 +7,7 @@ */ #include <linux/pgtable.h> -#include <asm/debugfs.h> +#include <linux/debugfs.h> #include <asm/cpu_has_feature.h> #include "ptdump.h" @@ -57,7 +57,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool #define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d) -static int bats_show_603(struct seq_file *m, void *v) +static int bats_show(struct seq_file *m, void *v) { seq_puts(m, "---[ Instruction Block Address Translation ]---\n"); @@ -88,22 +88,12 @@ static int bats_show_603(struct seq_file *m, void *v) return 0; } -static int bats_open(struct inode *inode, struct file *file) -{ - return single_open(file, bats_show_603, NULL); -} - -static const struct file_operations bats_fops = { - .open = bats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(bats); static int __init bats_init(void) { debugfs_create_file("block_address_translation", 0400, - powerpc_debugfs_root, NULL, &bats_fops); + arch_debugfs_dir, NULL, &bats_fops); return 0; } device_initcall(bats_init); diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c index 14f73868db66..5ad92d9dc5d1 100644 --- a/arch/powerpc/mm/ptdump/book3s64.c +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -103,8 +103,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index ad6df9a2e7c8..c7f824d294b2 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -526,17 +526,7 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } -static int ptdump_open(struct 
inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); static int ptdump_init(void) { diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 5062c58b1e5b..bf251191e78d 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -16,6 +16,7 @@ #include <linux/io.h> #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/ptdump.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <asm/fixmap.h> @@ -54,11 +55,12 @@ * */ struct pg_state { + struct ptdump_state ptdump; struct seq_file *seq; const struct addr_marker *marker; unsigned long start_address; unsigned long start_pa; - unsigned int level; + int level; u64 current_flags; bool check_wx; unsigned long wx_pages; @@ -102,6 +104,11 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; +static struct ptdump_range ptdump_range[] __ro_after_init = { + {TASK_SIZE_MAX, ~0UL}, + {0, 0} +}; + #define pt_dump_seq_printf(m, fmt, args...) \ ({ \ if (m) \ @@ -188,10 +195,9 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_page_update_state(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) +static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; u64 pa = val & PTE_RPN_MASK; st->level = level; @@ -205,15 +211,15 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr, } } -static void note_page(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); /* At first no level is set */ - if (!st->level) { + if (st->level == -1) { pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); - note_page_update_state(st, addr, level, val, page_size); + note_page_update_state(st, addr, level, val); /* * Dump the section of virtual memory when: * - the PTE flags from one entry to the next differs. @@ -242,95 +248,7 @@ static void note_page(struct pg_state *st, unsigned long addr, * Address indicates we have passed the end of the * current section of virtual memory */ - note_page_update_state(st, addr, level, val, page_size); - } -} - -static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) -{ - pte_t *pte = pte_offset_kernel(pmd, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PTE; i++, pte++) { - addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE); - - } -} - -static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start, - int pdshift, int level) -{ -#ifdef CONFIG_ARCH_HAS_HUGEPD - unsigned int i; - int shift = hugepd_shift(*phpd); - int ptrs_per_hpd = pdshift - shift > 0 ? 
1 << (pdshift - shift) : 1; - - if (start & ((1 << shift) - 1)) - return; - - for (i = 0; i < ptrs_per_hpd; i++) { - unsigned long addr = start + (i << shift); - pte_t *pte = hugepte_offset(*phpd, addr, pdshift); - - note_page(st, addr, level + 1, pte_val(*pte), 1 << shift); - } -#endif -} - -static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { - addr = start + i * PMD_SIZE; - if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd)) - /* pmd exists */ - walk_pte(st, pmd, addr); - else - note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE); - } -} - -static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) -{ - pud_t *pud = pud_offset(p4d, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PUD; i++, pud++) { - addr = start + i * PUD_SIZE; - if (!pud_none(*pud) && !pud_is_leaf(*pud)) - /* pud exists */ - walk_pmd(st, pud, addr); - else - note_page(st, addr, 2, pud_val(*pud), PUD_SIZE); - } -} - -static void walk_pagetables(struct pg_state *st) -{ - unsigned int i; - unsigned long addr = st->start_address & PGDIR_MASK; - pgd_t *pgd = pgd_offset_k(addr); - - /* - * Traverse the linux pagetable structure and dump pages that are in - * the hash pagetable. - */ - for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { - p4d_t *p4d = p4d_offset(pgd, 0); - - if (p4d_none(*p4d) || p4d_is_leaf(*p4d)) - note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE); - else if (is_hugepd(__hugepd(p4d_val(*p4d)))) - walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1); - else - /* p4d exists */ - walk_pud(st, p4d, addr); + note_page_update_state(st, addr, level, val); } } @@ -383,32 +301,19 @@ static int ptdump_show(struct seq_file *m, void *v) struct pg_state st = { .seq = m, .marker = address_markers, - .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, + .level = -1, + .ptdump = { + .note_page = note_page, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif - /* Traverse kernel page tables */ - walk_pagetables(&st); - note_page(&st, 0, 0, 0, 0); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); return 0; } - -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); static void build_pgtable_complete_mask(void) { @@ -420,22 +325,24 @@ static void build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -#ifdef CONFIG_PPC_DEBUG_WX +#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, - .marker = address_markers, + .marker = (struct addr_marker[]) { + { 0, NULL}, + { -1, NULL}, + }, + .level = -1, .check_wx = true, - .start_address = IS_ENABLED(CONFIG_PPC64) ? 
PAGE_OFFSET : TASK_SIZE, + .ptdump = { + .note_page = note_page, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif - - walk_pagetables(&st); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); if (st.wx_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", @@ -445,12 +352,23 @@ void ptdump_check_wx(void) } #endif -static int ptdump_init(void) +static int __init ptdump_init(void) { +#ifdef CONFIG_PPC64 + if (!radix_enabled()) + ptdump_range[0].start = KERN_VIRT_START; + else + ptdump_range[0].start = PAGE_OFFSET; + + ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD); +#endif + populate_markers(); build_pgtable_complete_mask(); - debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, - &ptdump_fops); + + if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS)) + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); + return 0; } device_initcall(ptdump_init); diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c index 565048a0c9be..9df3af8d481f 100644 --- a/arch/powerpc/mm/ptdump/segment_regs.c +++ b/arch/powerpc/mm/ptdump/segment_regs.c @@ -6,7 +6,7 @@ * This dumps the content of Segment Registers */ -#include <asm/debugfs.h> +#include <linux/debugfs.h> static void seg_show(struct seq_file *m, int i) { @@ -41,21 +41,11 @@ static int sr_show(struct seq_file *m, void *v) return 0; } -static int sr_open(struct inode *inode, struct file *file) -{ - return single_open(file, sr_show, NULL); -} - -static const struct file_operations sr_fops = { - .open = sr_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(sr); static int __init sr_init(void) { - debugfs_create_file("segment_registers", 0400, powerpc_debugfs_root, + debugfs_create_file("segment_registers", 0400, arch_debugfs_dir, NULL, &sr_fops); return 0; } diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index c005fe041c18..03607ab90c66 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -68,8 +68,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index bb0ee716de91..73e62e9b179b 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -340,6 +340,13 @@ static inline void perf_read_regs(struct pt_regs *regs) * If the PMU doesn't update the SIAR for non marked events use * pt_regs. * + * If regs is a kernel interrupt, always use SIAR. Some PMUs have an + * issue with regs_sipr not being in synch with SIAR in interrupt entry + * and return sequences, which can result in regs_sipr being true for + * kernel interrupts and SIAR, which has the effect of causing samples + * to pile up at mtmsrd MSR[EE] 0->1 or pending irq replay around + * interrupt entry/exit. + * * If the PMU has HV/PR flags then check to see if they * place the exception in userspace. If so, use pt_regs. 
In * continuous sampling mode the SIAR and the PMU exception are @@ -356,6 +363,8 @@ static inline void perf_read_regs(struct pt_regs *regs) use_siar = 1; else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING)) use_siar = 0; + else if (!user_mode(regs)) + use_siar = 1; else if (!(ppmu->flags & PPMU_NO_SIPR) && regs_sipr(regs)) use_siar = 0; else @@ -2251,18 +2260,10 @@ unsigned long perf_misc_flags(struct pt_regs *regs) */ unsigned long perf_instruction_pointer(struct pt_regs *regs) { - bool use_siar = regs_use_siar(regs); unsigned long siar = mfspr(SPRN_SIAR); - if (ppmu && (ppmu->flags & PPMU_P10_DD1)) { - if (siar) - return siar; - else - return regs->nip; - } else if (use_siar && siar_valid(regs)) - return mfspr(SPRN_SIAR) + perf_ip_adjust(regs); - else if (use_siar) - return 0; // no valid instruction pointer + if (regs_use_siar(regs) && siar_valid(regs) && siar) + return siar + perf_ip_adjust(regs); else return regs->nip; } diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index d48413e28c39..c756228a081f 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -175,7 +175,7 @@ static unsigned long single_gpci_request(u32 req, u32 starting_index, */ count = 0; for (i = offset; i < offset + length; i++) - count |= arg->bytes[i] << (i - offset); + count |= (u64)(arg->bytes[i]) << ((length - 1 - (i - offset)) * 8); *value = count; out: diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c index a5c898bb9bab..5d19daacd78a 100644 --- a/arch/powerpc/platforms/44x/machine_check.c +++ b/arch/powerpc/platforms/44x/machine_check.c @@ -11,7 +11,7 @@ int machine_check_440A(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; printk("Machine check in kernel mode.\n"); if (reason & ESR_IMCP){ @@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs) #ifdef CONFIG_PPC_47x int machine_check_47x(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; u32 mcsr; printk(KERN_ERR "Machine check in kernel mode.\n"); diff --git a/arch/powerpc/platforms/4xx/machine_check.c b/arch/powerpc/platforms/4xx/machine_check.c index a71c29892a91..a905da1d6f41 100644 --- a/arch/powerpc/platforms/4xx/machine_check.c +++ b/arch/powerpc/platforms/4xx/machine_check.c @@ -10,7 +10,7 @@ int machine_check_4xx(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; if (reason & ESR_IMCP) { printk("Instruction"); diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index b77cbb0a50e1..4142ebf01382 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -208,12 +208,6 @@ config TQM8560 select TQM85xx select CPM2 -config SBC8548 - bool "Wind River SBC8548" - select DEFAULT_UIMAGE - help - This option enables support for the Wind River SBC8548 board - config PPA8548 bool "Prodrive PPA8548" help diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile index d1dd0dca5ebf..60e4e97a929d 100644 --- a/arch/powerpc/platforms/85xx/Makefile +++ b/arch/powerpc/platforms/85xx/Makefile @@ -26,7 +26,6 @@ obj-$(CONFIG_CORENET_GENERIC) += corenet_generic.o obj-$(CONFIG_FB_FSL_DIU) += t1042rdb_diu.o obj-$(CONFIG_STX_GP3) += stx_gp3.o obj-$(CONFIG_TQM85xx) += tqm85xx.o -obj-$(CONFIG_SBC8548) += sbc8548.o obj-$(CONFIG_PPA8548) += ppa8548.o obj-$(CONFIG_SOCRATES) += socrates.o socrates_fpga_pic.o obj-$(CONFIG_KSI8560) += ksi8560.o diff 
--git a/arch/powerpc/platforms/85xx/sbc8548.c b/arch/powerpc/platforms/85xx/sbc8548.c deleted file mode 100644 index e4acf5ce6b07..000000000000 --- a/arch/powerpc/platforms/85xx/sbc8548.c +++ /dev/null @@ -1,134 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Wind River SBC8548 setup and early boot code. - * - * Copyright 2007 Wind River Systems Inc. - * - * By Paul Gortmaker (see MAINTAINERS for contact information) - * - * Based largely on the MPC8548CDS support - Copyright 2005 Freescale Inc. - */ - -#include <linux/stddef.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/reboot.h> -#include <linux/pci.h> -#include <linux/kdev_t.h> -#include <linux/major.h> -#include <linux/console.h> -#include <linux/delay.h> -#include <linux/seq_file.h> -#include <linux/initrd.h> -#include <linux/interrupt.h> -#include <linux/fsl_devices.h> -#include <linux/of_platform.h> -#include <linux/pgtable.h> - -#include <asm/page.h> -#include <linux/atomic.h> -#include <asm/time.h> -#include <asm/io.h> -#include <asm/machdep.h> -#include <asm/ipic.h> -#include <asm/pci-bridge.h> -#include <asm/irq.h> -#include <mm/mmu_decl.h> -#include <asm/prom.h> -#include <asm/udbg.h> -#include <asm/mpic.h> - -#include <sysdev/fsl_soc.h> -#include <sysdev/fsl_pci.h> - -#include "mpc85xx.h" - -static int sbc_rev; - -static void __init sbc8548_pic_init(void) -{ - struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN, - 0, 256, " OpenPIC "); - BUG_ON(mpic == NULL); - mpic_init(mpic); -} - -/* Extract the HW Rev from the EPLD on the board */ -static int __init sbc8548_hw_rev(void) -{ - struct device_node *np; - struct resource res; - unsigned int *rev; - int board_rev = 0; - - np = of_find_compatible_node(NULL, NULL, "hw-rev"); - if (np == NULL) { - printk("No HW-REV found in DTB.\n"); - return -ENODEV; - } - - of_address_to_resource(np, 0, &res); - of_node_put(np); - - rev = ioremap(res.start,sizeof(unsigned int)); - board_rev = (*rev) >> 28; - iounmap(rev); - - return board_rev; -} - -/* - * Setup the architecture - */ -static void __init sbc8548_setup_arch(void) -{ - if (ppc_md.progress) - ppc_md.progress("sbc8548_setup_arch()", 0); - - fsl_pci_assign_primary(); - - sbc_rev = sbc8548_hw_rev(); -} - -static void sbc8548_show_cpuinfo(struct seq_file *m) -{ - uint pvid, svid, phid1; - - pvid = mfspr(SPRN_PVR); - svid = mfspr(SPRN_SVR); - - seq_printf(m, "Vendor\t\t: Wind River\n"); - seq_printf(m, "Machine\t\t: SBC8548 v%d\n", sbc_rev); - seq_printf(m, "PVR\t\t: 0x%x\n", pvid); - seq_printf(m, "SVR\t\t: 0x%x\n", svid); - - /* Display cpu Pll setting */ - phid1 = mfspr(SPRN_HID1); - seq_printf(m, "PLL setting\t: 0x%x\n", ((phid1 >> 24) & 0x3f)); -} - -machine_arch_initcall(sbc8548, mpc85xx_common_publish_devices); - -/* - * Called very early, device-tree isn't unflattened - */ -static int __init sbc8548_probe(void) -{ - return of_machine_is_compatible("SBC8548"); -} - -define_machine(sbc8548) { - .name = "SBC8548", - .probe = sbc8548_probe, - .setup_arch = sbc8548_setup_arch, - .init_IRQ = sbc8548_pic_init, - .show_cpuinfo = sbc8548_show_cpuinfo, - .get_irq = mpic_get_irq, -#ifdef CONFIG_PCI - .pcibios_fixup_bus = fsl_pcibios_fixup_bus, - .pcibios_fixup_phb = fsl_pcibios_fixup_phb, -#endif - .calibrate_decr = generic_calibrate_decr, - .progress = udbg_progress, -}; diff --git a/arch/powerpc/platforms/86xx/Kconfig b/arch/powerpc/platforms/86xx/Kconfig index 07a9d60c618a..be867abebc83 100644 --- a/arch/powerpc/platforms/86xx/Kconfig +++ 
b/arch/powerpc/platforms/86xx/Kconfig @@ -20,12 +20,6 @@ config MPC8641_HPCN help This option enables support for the MPC8641 HPCN board. -config SBC8641D - bool "Wind River SBC8641D" - select DEFAULT_UIMAGE - help - This option enables support for the WRS SBC8641D board. - config MPC8610_HPCD bool "Freescale MPC8610 HPCD" select DEFAULT_UIMAGE @@ -74,7 +68,7 @@ config MPC8641 select FSL_PCI if PCI select PPC_UDBG_16550 select MPIC - default y if MPC8641_HPCN || SBC8641D || GEF_SBC610 || GEF_SBC310 || GEF_PPC9A \ + default y if MPC8641_HPCN || GEF_SBC610 || GEF_SBC310 || GEF_PPC9A \ || MVME7100 config MPC8610 diff --git a/arch/powerpc/platforms/86xx/Makefile b/arch/powerpc/platforms/86xx/Makefile index 2c04449be107..5bbe1475bf26 100644 --- a/arch/powerpc/platforms/86xx/Makefile +++ b/arch/powerpc/platforms/86xx/Makefile @@ -6,7 +6,6 @@ obj-y := pic.o common.o obj-$(CONFIG_SMP) += mpc86xx_smp.o obj-$(CONFIG_MPC8641_HPCN) += mpc86xx_hpcn.o -obj-$(CONFIG_SBC8641D) += sbc8641d.o obj-$(CONFIG_MPC8610_HPCD) += mpc8610_hpcd.o obj-$(CONFIG_GEF_SBC610) += gef_sbc610.o obj-$(CONFIG_GEF_SBC310) += gef_sbc310.o diff --git a/arch/powerpc/platforms/86xx/sbc8641d.c b/arch/powerpc/platforms/86xx/sbc8641d.c deleted file mode 100644 index dc23dd383d6e..000000000000 --- a/arch/powerpc/platforms/86xx/sbc8641d.c +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8641D board specific routines - * - * Copyright 2008 Wind River Systems Inc. - * - * By Paul Gortmaker (see MAINTAINERS for contact information) - * - * Based largely on the 8641 HPCN support by Freescale Semiconductor Inc. - */ - -#include <linux/stddef.h> -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/kdev_t.h> -#include <linux/delay.h> -#include <linux/seq_file.h> -#include <linux/of_platform.h> - -#include <asm/time.h> -#include <asm/machdep.h> -#include <asm/pci-bridge.h> -#include <asm/prom.h> -#include <mm/mmu_decl.h> -#include <asm/udbg.h> - -#include <asm/mpic.h> - -#include <sysdev/fsl_pci.h> -#include <sysdev/fsl_soc.h> - -#include "mpc86xx.h" - -static void __init -sbc8641_setup_arch(void) -{ - if (ppc_md.progress) - ppc_md.progress("sbc8641_setup_arch()", 0); - - printk("SBC8641 board from Wind River\n"); - -#ifdef CONFIG_SMP - mpc86xx_smp_init(); -#endif - - fsl_pci_assign_primary(); -} - - -static void -sbc8641_show_cpuinfo(struct seq_file *m) -{ - uint svid = mfspr(SPRN_SVR); - - seq_printf(m, "Vendor\t\t: Wind River Systems\n"); - - seq_printf(m, "SVR\t\t: 0x%x\n", svid); -} - - -/* - * Called very early, device-tree isn't unflattened - */ -static int __init sbc8641_probe(void) -{ - if (of_machine_is_compatible("wind,sbc8641")) - return 1; /* Looks good */ - - return 0; -} - -machine_arch_initcall(sbc8641, mpc86xx_common_publish_devices); - -define_machine(sbc8641) { - .name = "SBC8641D", - .probe = sbc8641_probe, - .setup_arch = sbc8641_setup_arch, - .init_IRQ = mpc86xx_init_irq, - .show_cpuinfo = sbc8641_show_cpuinfo, - .get_irq = mpic_get_irq, - .time_init = mpc86xx_time_init, - .calibrate_decr = generic_calibrate_decr, - .progress = udbg_progress, -#ifdef CONFIG_PCI - .pcibios_fixup_bus = fsl_pcibios_fixup_bus, -#endif -}; diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index ca2555b8a0c2..82335e364c44 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -12,8 +12,8 @@ #include <linux/export.h> #include <linux/of_platform.h> #include <linux/slab.h> +#include <linux/debugfs.h> 
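/*
 * The include swap just above in axon_msi.c is one instance of a conversion
 * repeated throughout this series: the powerpc-private <asm/debugfs.h> and
 * its powerpc_debugfs_root dentry are dropped in favour of the generic
 * arch_debugfs_dir declared in <linux/debugfs.h>.  A minimal sketch of the
 * call-site pattern, with "foo"/foo_fops as hypothetical placeholders:
 *
 *   debugfs_create_file("foo", 0400, powerpc_debugfs_root, NULL, &foo_fops);
 * becomes
 *   debugfs_create_file("foo", 0400, arch_debugfs_dir, NULL, &foo_fops);
 */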
-#include <asm/debugfs.h> #include <asm/dcr.h> #include <asm/machdep.h> #include <asm/prom.h> @@ -480,6 +480,6 @@ void axon_msi_debug_setup(struct device_node *dn, struct axon_msic *msic) snprintf(name, sizeof(name), "msic_%d", of_node_to_nid(dn)); - debugfs_create_file(name, 0600, powerpc_debugfs_root, msic, &fops_msic); + debugfs_create_file(name, 0600, arch_debugfs_dir, msic, &fops_msic); } #endif /* DEBUG */ diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c index 85521b3e7098..7a85b117f7a4 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -251,7 +251,7 @@ static int ppc750_machine_check_exception(struct pt_regs *regs) /* Are we prepared to handle this fault */ if ((entry = search_exception_tables(regs->nip)) != NULL) { tsi108_clear_pci_cfg_error(); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c index d8da6a483e59..9eb9abb5bce2 100644 --- a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c +++ b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c @@ -173,7 +173,7 @@ static int mpc7448_machine_check_exception(struct pt_regs *regs) /* Are we prepared to handle this fault */ if ((entry = search_exception_tables(regs->nip)) != NULL) { tsi108_clear_pci_cfg_error(); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c index 534b0317fc15..6087c70ed2ef 100644 --- a/arch/powerpc/platforms/pasemi/idle.c +++ b/arch/powerpc/platforms/pasemi/idle.c @@ -59,7 +59,7 @@ static int pasemi_system_reset_exception(struct pt_regs *regs) restore_astate(hard_smp_processor_id()); /* everything handled */ - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); return 1; } diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 528a7e0cf83a..e3ffdc8e8567 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -199,12 +199,12 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, */ power7_fastsleep_workaround_exit = false; - get_online_cpus(); + cpus_read_lock(); primary_thread_mask = cpu_online_cores_map(); on_each_cpu_mask(&primary_thread_mask, pnv_fastsleep_workaround_apply, &err, 1); - put_online_cpus(); + cpus_read_unlock(); if (err) { pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply"); goto fail; @@ -667,7 +667,6 @@ static unsigned long power9_idle_stop(unsigned long psscr) sprs.purr = mfspr(SPRN_PURR); sprs.spurr = mfspr(SPRN_SPURR); sprs.dscr = mfspr(SPRN_DSCR); - sprs.wort = mfspr(SPRN_WORT); sprs.ciabr = mfspr(SPRN_CIABR); sprs.mmcra = mfspr(SPRN_MMCRA); @@ -785,7 +784,6 @@ core_woken: mtspr(SPRN_PURR, sprs.purr); mtspr(SPRN_SPURR, sprs.spurr); mtspr(SPRN_DSCR, sprs.dscr); - mtspr(SPRN_WORT, sprs.wort); mtspr(SPRN_CIABR, sprs.ciabr); mtspr(SPRN_MMCRA, sprs.mmcra); diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 537a4daed614..877720c64515 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -18,7 +18,6 @@ #include <linux/memory_hotplug.h> 
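/*
 * The regs_set_recoverable() calls introduced above (holly.c, mpc7448_hpc2.c,
 * pasemi/idle.c) replace the open-coded
 * regs_set_return_msr(regs, regs->msr | MSR_RI).  A plausible minimal shape
 * for such a helper is sketched below; the exact upstream definition may
 * differ, for instance by compiling to a no-op on CPUs without an MSR[RI] bit.
 */
static inline void regs_set_recoverable(struct pt_regs *regs)
{
	/* mark the interrupted context as recoverable, as the old sites did */
	regs_set_return_msr(regs, regs->msr | MSR_RI);
}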
#include <linux/numa.h> #include <asm/machdep.h> -#include <asm/debugfs.h> #include <asm/cacheflush.h> /* This enables us to keep track of the memory removed from each node. */ @@ -330,7 +329,7 @@ DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, static int memtrace_init(void) { memtrace_debugfs_dir = debugfs_create_dir("memtrace", - powerpc_debugfs_root); + arch_debugfs_dir); debugfs_create_file("enable", 0600, memtrace_debugfs_dir, NULL, &memtrace_init_fops); diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 7824cc364bc4..05d3832019b9 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -13,11 +13,11 @@ #include <linux/of_address.h> #include <linux/of_platform.h> #include <linux/crash_dump.h> +#include <linux/debugfs.h> #include <asm/opal.h> #include <asm/io.h> #include <asm/imc-pmu.h> #include <asm/cputhreads.h> -#include <asm/debugfs.h> static struct dentry *imc_debugfs_parent; @@ -56,7 +56,7 @@ static void export_imc_mode_and_cmd(struct device_node *node, u32 cb_offset; struct imc_mem_info *ptr = pmu_ptr->mem_info; - imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); + imc_debugfs_parent = debugfs_create_dir("imc", arch_debugfs_dir); if (of_property_read_u32(node, "cb_offset", &cb_offset)) cb_offset = IMC_CNTL_BLK_OFFSET; @@ -186,7 +186,7 @@ static void disable_nest_pmu_counters(void) int nid, cpu; const struct cpumask *l_cpumask; - get_online_cpus(); + cpus_read_lock(); for_each_node_with_cpus(nid) { l_cpumask = cpumask_of_node(nid); cpu = cpumask_first_and(l_cpumask, cpu_online_mask); @@ -195,7 +195,7 @@ static void disable_nest_pmu_counters(void) opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, get_hard_smp_processor_id(cpu)); } - put_online_cpus(); + cpus_read_unlock(); } static void disable_core_pmu_counters(void) @@ -203,7 +203,7 @@ static void disable_core_pmu_counters(void) cpumask_t cores_map; int cpu, rc; - get_online_cpus(); + cpus_read_lock(); /* Disable the IMC Core functions */ cores_map = cpu_online_cores_map(); for_each_cpu(cpu, &cores_map) { @@ -213,7 +213,7 @@ static void disable_core_pmu_counters(void) pr_err("%s: Failed to stop Core (cpu = %d)\n", __FUNCTION__, cpu); } - put_online_cpus(); + cpus_read_unlock(); } int get_max_nest_dev(void) diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index 608569082ba0..1e5d51db40f8 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -10,13 +10,13 @@ #include <linux/bug.h> #include <linux/io.h> #include <linux/slab.h> +#include <linux/debugfs.h> #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/opal.h> #include <asm/prom.h> #include <linux/uaccess.h> -#include <asm/debugfs.h> #include <asm/isa-bridge.h> static int opal_lpc_chip_id = -1; @@ -371,7 +371,7 @@ static int opal_lpc_init_debugfs(void) if (opal_lpc_chip_id < 0) return -ENODEV; - root = debugfs_create_dir("lpc", powerpc_debugfs_root); + root = debugfs_create_dir("lpc", arch_debugfs_dir); rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO); rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index fd510d961b8c..6b4eed2ef4fa 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -14,11 +14,11 @@ #include <linux/gfp.h> #include <linux/slab.h> 
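/*
 * The get_online_cpus()/put_online_cpus() pairs converted above (powernv
 * idle.c and opal-imc.c) become cpus_read_lock()/cpus_read_unlock().  Both
 * names take the CPU-hotplug read lock, so this is a rename with unchanged
 * semantics; the surrounding pattern stays:
 *
 *   cpus_read_lock();      // no CPU may come or go while the lock is held
 *   ...walk cpu_online_mask, issue per-CPU OPAL calls...
 *   cpus_read_unlock();
 */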
#include <linux/uaccess.h> +#include <linux/debugfs.h> #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/opal.h> -#include <asm/debugfs.h> #include <asm/prom.h> static u64 opal_scom_unmangle(u64 addr) @@ -189,7 +189,7 @@ static int scom_debug_init(void) if (!firmware_has_feature(FW_FEATURE_OPAL)) return 0; - root = debugfs_create_dir("scom", powerpc_debugfs_root); + root = debugfs_create_dir("scom", arch_debugfs_dir); if (!root) return -1; diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 2835376e61a4..e9d18519e650 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -588,7 +588,7 @@ static int opal_recover_mce(struct pt_regs *regs, { int recovered = 0; - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7de464679292..3dd35c327d1c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -20,6 +20,7 @@ #include <linux/iommu.h> #include <linux/rculist.h> #include <linux/sizes.h> +#include <linux/debugfs.h> #include <asm/sections.h> #include <asm/io.h> @@ -32,10 +33,10 @@ #include <asm/iommu.h> #include <asm/tce.h> #include <asm/xics.h> -#include <asm/debugfs.h> #include <asm/firmware.h> #include <asm/pnv-pci.h> #include <asm/mmzone.h> +#include <asm/xive.h> #include <misc/cxl-base.h> @@ -1962,27 +1963,40 @@ void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->dma_setup_done = true; } -int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) +/* + * Called from KVM in real mode to EOI passthru interrupts. The ICP + * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru(). + * + * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call + * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ + * numbers of the in-the-middle MSI domain are vector numbers and it's + * good enough for OPAL. Use that. + */ +int64_t pnv_opal_pci_msi_eoi(struct irq_data *d) { - struct pnv_phb *phb = container_of(chip, struct pnv_phb, - ioda.irq_chip); + struct pci_controller *hose = irq_data_get_irq_chip_data(d->parent_data); + struct pnv_phb *phb = hose->private_data; - return opal_pci_msi_eoi(phb->opal_id, hw_irq); + return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq); } +/* + * The IRQ data is mapped in the XICS domain, with OPAL HW IRQ numbers + */ static void pnv_ioda2_msi_eoi(struct irq_data *d) { int64_t rc; unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); - struct irq_chip *chip = irq_data_get_irq_chip(d); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; - rc = pnv_opal_pci_msi_eoi(chip, hw_irq); + rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); WARN_ON_ONCE(rc); icp_native_eoi(d); } - +/* P8/CXL only */ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) { struct irq_data *idata; @@ -2004,27 +2018,32 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; } irq_set_chip(virq, &phb->ioda.irq_chip); + irq_set_chip_data(virq, phb->hose); } +static struct irq_chip pnv_pci_msi_irq_chip; + /* * Returns true iff chip is something that we could call * pnv_opal_pci_msi_eoi for. 
*/ bool is_pnv_opal_msi(struct irq_chip *chip) { - return chip->irq_eoi == pnv_ioda2_msi_eoi; + return chip == &pnv_pci_msi_irq_chip; } EXPORT_SYMBOL_GPL(is_pnv_opal_msi); -static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg) +static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int xive_num, + unsigned int is_64, struct msi_msg *msg) { struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); - unsigned int xive_num = hwirq - phb->msi_base; __be32 data; int rc; + dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__, + is_64 ? "64" : "32", xive_num); + /* No PE assigned ? bail out ... no MSI for you ! */ if (pe == NULL) return -ENXIO; @@ -2072,12 +2091,209 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, } msg->data = be32_to_cpu(data); - pnv_set_msi_irq_chip(phb, virq); + return 0; +} + +/* + * The msi_free() op is called before irq_domain_free_irqs_top() when + * the handler data is still available. Use that to clear the XIVE + * controller. + */ +static void pnv_msi_ops_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int irq) +{ + if (xive_enabled()) + xive_irq_free_data(irq); +} + +static struct msi_domain_ops pnv_pci_msi_domain_ops = { + .msi_free = pnv_msi_ops_msi_free, +}; + +static void pnv_msi_shutdown(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_shutdown) + d->chip->irq_shutdown(d); +} + +static void pnv_msi_mask(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void pnv_msi_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip pnv_pci_msi_irq_chip = { + .name = "PNV-PCI-MSI", + .irq_shutdown = pnv_msi_shutdown, + .irq_mask = pnv_msi_mask, + .irq_unmask = pnv_msi_unmask, + .irq_eoi = irq_chip_eoi_parent, +}; + +static struct msi_domain_info pnv_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), + .ops = &pnv_pci_msi_domain_ops, + .chip = &pnv_pci_msi_irq_chip, +}; + +static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_data_get_msi_desc(d); + struct pci_dev *pdev = msi_desc_to_pci_dev(entry); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + int rc; + + rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq, + entry->msi_attrib.is_64, msg); + if (rc) + dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n", + entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc); +} + +/* + * The IRQ data is mapped in the MSI domain in which HW IRQ numbers + * correspond to vector numbers. + */ +static void pnv_msi_eoi(struct irq_data *d) +{ + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_PHB3) { + /* + * The EOI OPAL call takes an OPAL HW IRQ number but + * since it is translated into a vector number in + * OPAL, use that directly. 
+ */ + WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq)); + } + + irq_chip_eoi_parent(d); +} + +static struct irq_chip pnv_msi_irq_chip = { + .name = "PNV-MSI", + .irq_shutdown = pnv_msi_shutdown, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = pnv_msi_eoi, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = pnv_msi_compose_msg, +}; + +static int pnv_irq_parent_domain_alloc(struct irq_domain *domain, + unsigned int virq, int hwirq) +{ + struct irq_fwspec parent_fwspec; + int ret; + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 2; + parent_fwspec.param[0] = hwirq; + parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); + if (ret) + return ret; + + return 0; +} + +static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct pci_controller *hose = domain->host_data; + struct pnv_phb *phb = hose->private_data; + msi_alloc_info_t *info = arg; + struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc); + int hwirq; + int i, ret; + + hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs); + if (hwirq < 0) { + dev_warn(&pdev->dev, "failed to find a free MSI\n"); + return -ENOSPC; + } + + dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__, + hose->dn, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + ret = pnv_irq_parent_domain_alloc(domain, virq + i, + phb->msi_base + hwirq + i); + if (ret) + goto out; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &pnv_msi_irq_chip, hose); + } + + return 0; + +out: + irq_domain_free_irqs_parent(domain, virq, i - 1); + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs); + return ret; +} + +static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + pr_debug("%s bridge %pOF %d/%lx #%d\n", __func__, hose->dn, + virq, d->hwirq, nr_irqs); + + msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs); + /* XIVE domain is cleared through ->msi_free() */ +} + +static const struct irq_domain_ops pnv_irq_domain_ops = { + .alloc = pnv_irq_domain_alloc, + .free = pnv_irq_domain_free, +}; + +static int pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count) +{ + struct pnv_phb *phb = hose->private_data; + struct irq_domain *parent = irq_get_default_host(); + + hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id); + if (!hose->fwnode) + return -ENOMEM; - pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," - " address=%x_%08x data=%x PE# %x\n", - pci_name(dev), is_64 ? 
"64" : "32", hwirq, xive_num, - msg->address_hi, msg->address_lo, data, pe->pe_number); + hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count, + hose->fwnode, + &pnv_irq_domain_ops, hose); + if (!hose->dev_domain) { + pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); + irq_domain_free_fwnode(hose->fwnode); + return -ENOMEM; + } + + hose->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(hose->dn), + &pnv_msi_domain_info, + hose->dev_domain); + if (!hose->msi_domain) { + pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); + irq_domain_free_fwnode(hose->fwnode); + irq_domain_remove(hose->dev_domain); + return -ENOMEM; + } return 0; } @@ -2102,10 +2318,10 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) return; } - phb->msi_setup = pnv_pci_ioda_msi_setup; - phb->msi32_support = 1; pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); + + pnv_msi_allocate_domains(phb->hose, count); } static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, @@ -2259,7 +2475,7 @@ static void pnv_pci_ioda_create_dbgfs(void) phb = hose->private_data; sprintf(name, "PCI%04x", hose->global_number); - phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); + phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir); debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs, phb, &pnv_pci_diag_data_fops); @@ -2709,8 +2925,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, .dma_bus_setup = pnv_pci_ioda_dma_bus_setup, .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, .enable_device_hook = pnv_pci_enable_device_hook, .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 6bb3c52633fb..9a8391b983d1 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -160,73 +160,6 @@ exit: } EXPORT_SYMBOL_GPL(pnv_pci_set_power_state); -int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) -{ - struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); - struct msi_desc *entry; - struct msi_msg msg; - int hwirq; - unsigned int virq; - int rc; - - if (WARN_ON(!phb) || !phb->msi_bmp.bitmap) - return -ENODEV; - - if (pdev->no_64bit_msi && !phb->msi32_support) - return -ENODEV; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->msi_attrib.is_64 && !phb->msi32_support) { - pr_warn("%s: Supports only 64-bit MSIs\n", - pci_name(pdev)); - return -ENXIO; - } - hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1); - if (hwirq < 0) { - pr_warn("%s: Failed to find a free MSI\n", - pci_name(pdev)); - return -ENOSPC; - } - virq = irq_create_mapping(NULL, phb->msi_base + hwirq); - if (!virq) { - pr_warn("%s: Failed to map MSI to linux irq\n", - pci_name(pdev)); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); - return -ENOMEM; - } - rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq, - virq, entry->msi_attrib.is_64, &msg); - if (rc) { - pr_warn("%s: Failed to setup MSI\n", pci_name(pdev)); - irq_dispose_mapping(virq); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); - return rc; - } - irq_set_msi_desc(virq, entry); - pci_write_msi_msg(virq, &msg); - } - return 0; -} - -void pnv_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct pnv_phb 
*phb = pci_bus_to_pnvhb(pdev->bus); - struct msi_desc *entry; - irq_hw_number_t hwirq; - - if (WARN_ON(!phb)) - return; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->irq) - continue; - hwirq = virq_to_hw(entry->irq); - irq_set_msi_desc(entry->irq, NULL); - irq_dispose_mapping(entry->irq); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1); - } -} - /* Nicely print the contents of the PE State Tables (PEST). */ static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size) { diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index c8d4f222a86f..966a9eb64339 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -123,11 +123,7 @@ struct pnv_phb { #endif unsigned int msi_base; - unsigned int msi32_support; struct msi_bitmap msi_bmp; - int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg); int (*init_m64)(struct pnv_phb *phb); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); @@ -289,8 +285,6 @@ extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); -extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); -extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index 7ddc7ec6a7c0..ef710a715903 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -169,7 +169,8 @@ static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn, spin_unlock_irqrestore(&ps3_htab_lock, flags); } -static void ps3_hpte_clear(void) +/* Called during kexec sequence with MMU off */ +static notrace void ps3_hpte_clear(void) { unsigned long hpte_count = (1UL << ppc64_pft_size) >> 4; u64 i; diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c index a81eac35d900..9c44f335c0b9 100644 --- a/arch/powerpc/platforms/ps3/mm.c +++ b/arch/powerpc/platforms/ps3/mm.c @@ -195,9 +195,11 @@ fail: /** * ps3_mm_vas_destroy - + * + * called during kexec sequence with MMU off. */ -void ps3_mm_vas_destroy(void) +notrace void ps3_mm_vas_destroy(void) { int result; @@ -1243,9 +1245,11 @@ void __init ps3_mm_init(void) /** * ps3_mm_shutdown - final cleanup of address space + * + * called during kexec sequence with MMU off. 
*/ -void ps3_mm_shutdown(void) +notrace void ps3_mm_shutdown(void) { ps3_mm_region_destroy(&map.r1); } diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 982f069e4c31..352af5b14a0f 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -11,10 +11,10 @@ #include <linux/spinlock.h> #include <asm/smp.h> #include <linux/uaccess.h> +#include <linux/debugfs.h> #include <asm/firmware.h> #include <asm/dtl.h> #include <asm/lppaca.h> -#include <asm/debugfs.h> #include <asm/plpar_wrappers.h> #include <asm/machdep.h> @@ -338,7 +338,7 @@ static int dtl_init(void) /* set up common debugfs structure */ - dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root); + dtl_dir = debugfs_create_dir("dtl", arch_debugfs_dir); debugfs_create_x8("dtl_event_mask", 0600, dtl_dir, &dtl_event_mask); debugfs_create_u32("dtl_buf_entries", 0400, dtl_dir, &dtl_buf_entries); diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 4c7b7f5a2ebc..f162156b7b68 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -119,10 +119,11 @@ struct vec5_fw_feature { static __initdata struct vec5_fw_feature vec5_fw_features_table[] = { - {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY}, + {FW_FEATURE_FORM1_AFFINITY, OV5_FORM1_AFFINITY}, {FW_FEATURE_PRRN, OV5_PRRN}, {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, + {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY}, }; static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 7e970f81d8ff..d646c22e94ab 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -39,6 +39,12 @@ /* This version can't take the spinlock, because it never returns */ static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE; +/* + * Record the CPU ids used on each nodes. + * Protected by cpu_add_remove_lock. + */ +static cpumask_var_t node_recorded_ids_map[MAX_NUMNODES]; + static void rtas_stop_self(void) { static struct rtas_args args; @@ -139,72 +145,148 @@ static void pseries_cpu_die(unsigned int cpu) paca_ptrs[cpu]->cpu_start = 0; } +/** + * find_cpu_id_range - found a linear ranger of @nthreads free CPU ids. + * @nthreads : the number of threads (cpu ids) + * @assigned_node : the node it belongs to or NUMA_NO_NODE if free ids from any + * node can be peek. + * @cpu_mask: the returned CPU mask. + * + * Returns 0 on success. + */ +static int find_cpu_id_range(unsigned int nthreads, int assigned_node, + cpumask_var_t *cpu_mask) +{ + cpumask_var_t candidate_mask; + unsigned int cpu, node; + int rc = -ENOSPC; + + if (!zalloc_cpumask_var(&candidate_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_clear(*cpu_mask); + for (cpu = 0; cpu < nthreads; cpu++) + cpumask_set_cpu(cpu, *cpu_mask); + + BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); + + /* Get a bitmap of unoccupied slots. */ + cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); + + if (assigned_node != NUMA_NO_NODE) { + /* + * Remove free ids previously assigned on the other nodes. We + * can walk only online nodes because once a node became online + * it is not turned offlined back. 
+ */ + for_each_online_node(node) { + if (node == assigned_node) + continue; + cpumask_andnot(candidate_mask, candidate_mask, + node_recorded_ids_map[node]); + } + } + + if (cpumask_empty(candidate_mask)) + goto out; + + while (!cpumask_empty(*cpu_mask)) { + if (cpumask_subset(*cpu_mask, candidate_mask)) + /* Found a range where we can insert the new cpu(s) */ + break; + cpumask_shift_left(*cpu_mask, *cpu_mask, nthreads); + } + + if (!cpumask_empty(*cpu_mask)) + rc = 0; + +out: + free_cpumask_var(candidate_mask); + return rc; +} + /* * Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle - * here is that a cpu device node may represent up to two logical cpus + * here is that a cpu device node may represent multiple logical cpus * in the SMT case. We must honor the assumption in other code that * the logical ids for sibling SMT threads x and y are adjacent, such * that x^1 == y and y^1 == x. */ static int pseries_add_processor(struct device_node *np) { - unsigned int cpu; - cpumask_var_t candidate_mask, tmp; - int err = -ENOSPC, len, nthreads, i; + int len, nthreads, node, cpu, assigned_node; + int rc = 0; + cpumask_var_t cpu_mask; const __be32 *intserv; intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); if (!intserv) return 0; - zalloc_cpumask_var(&candidate_mask, GFP_KERNEL); - zalloc_cpumask_var(&tmp, GFP_KERNEL); - nthreads = len / sizeof(u32); - for (i = 0; i < nthreads; i++) - cpumask_set_cpu(i, tmp); - cpu_maps_update_begin(); + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; - BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); + /* + * Fetch from the DT nodes read by dlpar_configure_connector() the NUMA + * node id the added CPU belongs to. + */ + node = of_node_to_nid(np); + if (node < 0 || !node_possible(node)) + node = first_online_node; - /* Get a bitmap of unoccupied slots. */ - cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); - if (cpumask_empty(candidate_mask)) { - /* If we get here, it most likely means that NR_CPUS is - * less than the partition's max processors setting. + BUG_ON(node == NUMA_NO_NODE); + assigned_node = node; + + cpu_maps_update_begin(); + + rc = find_cpu_id_range(nthreads, node, &cpu_mask); + if (rc && nr_node_ids > 1) { + /* + * Try again, considering the free CPU ids from the other node. */ - printk(KERN_ERR "Cannot add cpu %pOF; this system configuration" - " supports %d logical cpus.\n", np, - num_possible_cpus()); - goto out_unlock; + node = NUMA_NO_NODE; + rc = find_cpu_id_range(nthreads, NUMA_NO_NODE, &cpu_mask); } - while (!cpumask_empty(tmp)) - if (cpumask_subset(tmp, candidate_mask)) - /* Found a range where we can insert the new cpu(s) */ - break; - else - cpumask_shift_left(tmp, tmp, nthreads); - - if (cpumask_empty(tmp)) { - printk(KERN_ERR "Unable to find space in cpu_present_mask for" - " processor %pOFn with %d thread(s)\n", np, - nthreads); - goto out_unlock; + if (rc) { + pr_err("Cannot add cpu %pOF; this system configuration" + " supports %d logical cpus.\n", np, num_possible_cpus()); + goto out; } - for_each_cpu(cpu, tmp) { + for_each_cpu(cpu, cpu_mask) { BUG_ON(cpu_present(cpu)); set_cpu_present(cpu, true); set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++)); } - err = 0; -out_unlock: + + /* Record the newly used CPU ids for the associate node. 
*/ + cpumask_or(node_recorded_ids_map[assigned_node], + node_recorded_ids_map[assigned_node], cpu_mask); + + /* + * If node is set to NUMA_NO_NODE, CPU ids have be reused from + * another node, remove them from its mask. + */ + if (node == NUMA_NO_NODE) { + cpu = cpumask_first(cpu_mask); + pr_warn("Reusing free CPU ids %d-%d from another node\n", + cpu, cpu + nthreads - 1); + for_each_online_node(node) { + if (node == assigned_node) + continue; + cpumask_andnot(node_recorded_ids_map[node], + node_recorded_ids_map[node], + cpu_mask); + } + } + +out: cpu_maps_update_done(); - free_cpumask_var(candidate_mask); - free_cpumask_var(tmp); - return err; + free_cpumask_var(cpu_mask); + return rc; } /* @@ -498,6 +580,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return saved_rc; } + update_numa_distance(dn); + rc = dlpar_online_cpu(dn); if (rc) { saved_rc = rc; @@ -908,6 +992,7 @@ static struct notifier_block pseries_smp_nb = { static int __init pseries_cpu_hotplug_init(void) { int qcss_tok; + unsigned int node; #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE ppc_md.cpu_probe = dlpar_cpu_probe; @@ -929,8 +1014,18 @@ static int __init pseries_cpu_hotplug_init(void) smp_ops->cpu_die = pseries_cpu_die; /* Processors can be added/removed only on LPAR */ - if (firmware_has_feature(FW_FEATURE_LPAR)) + if (firmware_has_feature(FW_FEATURE_LPAR)) { + for_each_node(node) { + alloc_bootmem_cpumask_var(&node_recorded_ids_map[node]); + + /* Record ids of CPU added at boot time */ + cpumask_or(node_recorded_ids_map[node], + node_recorded_ids_map[node], + cpumask_of_node(node)); + } + of_reconfig_notifier_register(&pseries_smp_nb); + } return 0; } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 377d852f5a9a..91cf23495ccb 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) return -ENODEV; } + update_numa_distance(lmb_node); + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (!dr_node) { dlpar_free_cc_nodes(lmb_node); @@ -211,13 +213,11 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb) { unsigned long section_nr; - struct mem_section *mem_sect; struct memory_block *mem_block; section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr)); - mem_sect = __nr_to_section(section_nr); - mem_block = find_memory_block(mem_sect); + mem_block = find_memory_block(section_nr); return mem_block; } @@ -286,7 +286,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si { unsigned long block_sz, start_pfn; int sections_per_block; - int i, nid; + int i; start_pfn = base >> PAGE_SHIFT; @@ -297,10 +297,9 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si block_sz = pseries_memory_block_size(); sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - nid = memory_add_physaddr_to_nid(base); for (i = 0; i < sections_per_block; i++) { - __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); + __remove_memory(base, MIN_MEMORY_BLOCK_SIZE); base += MIN_MEMORY_BLOCK_SIZE; } @@ -387,7 +386,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) block_sz = pseries_memory_block_size(); - __remove_memory(mem_block->nid, lmb->base_addr, block_sz); + __remove_memory(lmb->base_addr, block_sz); put_device(&mem_block->dev); /* Update memory regions for memory remove */ @@ -660,7 +659,7 
@@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; @@ -979,6 +978,10 @@ static int pseries_memory_notifier(struct notifier_block *nb, case OF_RECONFIG_DETACH_NODE: err = pseries_remove_mem_node(rd->dn); break; + case OF_RECONFIG_UPDATE_PROPERTY: + if (!strcmp(rd->dn->name, + "ibm,dynamic-reconfiguration-memory")) + drmem_update_lmbs(rd->prop); } return notifier_from_errno(err); } diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 0c55b991f665..dab5c56ffd0e 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -53,28 +53,31 @@ enum { DDW_EXT_QUERY_OUT_SIZE = 2 }; -static struct iommu_table_group *iommu_pseries_alloc_group(int node) +static struct iommu_table *iommu_pseries_alloc_table(int node) { - struct iommu_table_group *table_group; struct iommu_table *tbl; - table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, - node); - if (!table_group) - return NULL; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); if (!tbl) - goto free_group; + return NULL; INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); + return tbl; +} + +static struct iommu_table_group *iommu_pseries_alloc_group(int node) +{ + struct iommu_table_group *table_group; - table_group->tables[0] = tbl; + table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node); + if (!table_group) + return NULL; - return table_group; + table_group->tables[0] = iommu_pseries_alloc_table(node); + if (table_group->tables[0]) + return table_group; -free_group: kfree(table_group); return NULL; } @@ -107,6 +110,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, u64 proto_tce; __be64 *tcep; u64 rpn; + const unsigned long tceshift = tbl->it_page_shift; + const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl); proto_tce = TCE_PCI_READ; // Read allowed @@ -117,10 +122,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, while (npages--) { /* can't move this out since we might cross MEMBLOCK boundary */ - rpn = __pa(uaddr) >> TCE_SHIFT; - *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + rpn = __pa(uaddr) >> tceshift; + *tcep = cpu_to_be64(proto_tce | rpn << tceshift); - uaddr += TCE_PAGE_SIZE; + uaddr += pagesize; tcep++; } return 0; @@ -146,7 +151,7 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) return be64_to_cpu(*tcep); } -static void tce_free_pSeriesLP(unsigned long liobn, long, long); +static void tce_free_pSeriesLP(unsigned long liobn, long, long, long); static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, @@ -166,12 +171,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, proto_tce |= TCE_PCI_WRITE; while (npages--) { - tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift; + tce = proto_tce | rpn << tceshift; rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; - tce_free_pSeriesLP(liobn, tcenum_start, + tce_free_pSeriesLP(liobn, tcenum_start, tceshift, (npages_start - (npages + 1))); break; } @@ -205,10 +210,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long 
tcenum_start = tcenum, npages_start = npages; int ret = 0; unsigned long flags; + const unsigned long tceshift = tbl->it_page_shift; if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { return tce_build_pSeriesLP(tbl->it_index, tcenum, - tbl->it_page_shift, npages, uaddr, + tceshift, npages, uaddr, direction, attrs); } @@ -225,13 +231,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if (!tcep) { local_irq_restore(flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, - tbl->it_page_shift, + tceshift, npages, uaddr, direction, attrs); } __this_cpu_write(tce_page, tcep); } - rpn = __pa(uaddr) >> TCE_SHIFT; + rpn = __pa(uaddr) >> tceshift; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -245,12 +251,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE); for (l = 0; l < limit; l++) { - tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift); rpn++; } rc = plpar_tce_put_indirect((u64)tbl->it_index, - (u64)tcenum << 12, + (u64)tcenum << tceshift, (u64)__pa(tcep), limit); @@ -277,12 +283,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages) +static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + long npages) { u64 rc; while (npages--) { - rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0); + rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0); if (rc && printk_ratelimit()) { printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); @@ -301,9 +308,11 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n u64 rc; if (!firmware_has_feature(FW_FEATURE_STUFF_TCE)) - return tce_free_pSeriesLP(tbl->it_index, tcenum, npages); + return tce_free_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, npages); - rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); + rc = plpar_tce_stuff((u64)tbl->it_index, + (u64)tcenum << tbl->it_page_shift, 0, npages); if (rc && printk_ratelimit()) { printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n"); @@ -319,7 +328,8 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) u64 rc; unsigned long tce_ret; - rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret); + rc = plpar_tce_get((u64)tbl->it_index, + (u64)tcenum << tbl->it_page_shift, &tce_ret); if (rc && printk_ratelimit()) { printk("tce_get_pSeriesLP: plpar_tce_get failed. 
rc=%lld\n", rc); @@ -339,7 +349,7 @@ struct dynamic_dma_window_prop { __be32 window_shift; /* ilog2(tce_window_size) */ }; -struct direct_window { +struct dma_win { struct device_node *device; const struct dynamic_dma_window_prop *prop; struct list_head list; @@ -359,12 +369,13 @@ struct ddw_create_response { u32 addr_lo; }; -static LIST_HEAD(direct_window_list); +static LIST_HEAD(dma_win_list); /* prevents races between memory on/offline and window creation */ -static DEFINE_SPINLOCK(direct_window_list_lock); +static DEFINE_SPINLOCK(dma_win_list_lock); /* protects initializing window twice for same device */ -static DEFINE_MUTEX(direct_window_init_mutex); +static DEFINE_MUTEX(dma_win_init_mutex); #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" +#define DMA64_PROPNAME "linux,dma64-ddr-window-info" static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, unsigned long num_pfn, const void *arg) @@ -491,6 +502,24 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); } +static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno, + unsigned long liobn, unsigned long win_addr, + unsigned long window_size, unsigned long page_shift, + void *base, struct iommu_table_ops *table_ops) +{ + tbl->it_busno = busno; + tbl->it_index = liobn; + tbl->it_offset = win_addr >> page_shift; + tbl->it_size = window_size >> page_shift; + tbl->it_page_shift = page_shift; + tbl->it_base = (unsigned long)base; + tbl->it_blocksize = 16; + tbl->it_type = TCE_PCI; + tbl->it_ops = table_ops; +} + +struct iommu_table_ops iommu_table_pseries_ops; + static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl) @@ -499,8 +528,13 @@ static void iommu_table_setparms(struct pci_controller *phb, const unsigned long *basep; const u32 *sizep; - node = phb->dn; + /* Test if we are going over 2GB of DMA space */ + if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) { + udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + } + node = phb->dn; basep = of_get_property(node, "linux,tce-base", NULL); sizep = of_get_property(node, "linux,tce-size", NULL); if (basep == NULL || sizep == NULL) { @@ -509,33 +543,18 @@ static void iommu_table_setparms(struct pci_controller *phb, return; } - tbl->it_base = (unsigned long)__va(*basep); + iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur, + phb->dma_window_size, IOMMU_PAGE_SHIFT_4K, + __va(*basep), &iommu_table_pseries_ops); if (!is_kdump_kernel()) memset((void *)tbl->it_base, 0, *sizep); - tbl->it_busno = phb->bus->number; - tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; - - /* Units of tce entries */ - tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift; - - /* Test if we are going over 2GB of DMA space */ - if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { - udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); - panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); - } - phb->dma_window_base_cur += phb->dma_window_size; - - /* Set the tce table size - measured in entries */ - tbl->it_size = phb->dma_window_size >> tbl->it_page_shift; - - tbl->it_index = 0; - tbl->it_blocksize = 16; - tbl->it_type = TCE_PCI; } +struct iommu_table_ops iommu_table_lpar_multi_ops; + /* * iommu_table_setparms_lpar * @@ -547,17 +566,13 @@ static void 
iommu_table_setparms_lpar(struct pci_controller *phb, struct iommu_table_group *table_group, const __be32 *dma_window) { - unsigned long offset, size; + unsigned long offset, size, liobn; - of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size); + of_parse_dma_window(dn, dma_window, &liobn, &offset, &size); + + iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL, + &iommu_table_lpar_multi_ops); - tbl->it_busno = phb->bus->number; - tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; - tbl->it_base = 0; - tbl->it_blocksize = 16; - tbl->it_type = TCE_PCI; - tbl->it_offset = offset >> tbl->it_page_shift; - tbl->it_size = size >> tbl->it_page_shift; table_group->tce32_start = offset; table_group->tce32_size = size; @@ -637,7 +652,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) tbl = pci->table_group->tables[0]; iommu_table_setparms(pci->phb, dn, tbl); - tbl->it_ops = &iommu_table_pseries_ops; + if (!iommu_init_table(tbl, pci->phb->node, 0, 0)) panic("Failed to initialize iommu table"); @@ -698,7 +713,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", dn); - /* Find nearest ibm,dma-window, walking up the device tree */ + /* + * Find nearest ibm,dma-window (default DMA window), walking up the + * device tree + */ for (pdn = dn; pdn != NULL; pdn = pdn->parent) { dma_window = of_get_property(pdn, "ibm,dma-window", NULL); if (dma_window != NULL) @@ -720,7 +738,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) tbl = ppci->table_group->tables[0]; iommu_table_setparms_lpar(ppci->phb, pdn, tbl, ppci->table_group, dma_window); - tbl->it_ops = &iommu_table_lpar_multi_ops; + if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) panic("Failed to initialize iommu table"); iommu_register_group(ppci->table_group, @@ -750,7 +768,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); - tbl->it_ops = &iommu_table_pseries_ops; + if (!iommu_init_table(tbl, phb->node, 0, 0)) panic("Failed to initialize iommu table"); @@ -785,17 +803,10 @@ static int __init disable_ddw_setup(char *str) early_param("disable_ddw", disable_ddw_setup); -static void remove_dma_window(struct device_node *np, u32 *ddw_avail, - struct property *win) +static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp) { - struct dynamic_dma_window_prop *dwp; - u64 liobn; int ret; - dwp = win->value; - liobn = (u64)be32_to_cpu(dwp->liobn); - - /* clear the whole window, note the arg is in kernel pages */ ret = tce_clearrange_multi_pSeriesLP(0, 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); if (ret) @@ -804,94 +815,136 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail, else pr_debug("%pOF successfully cleared tces in window.\n", np); +} + +/* + * Call only if DMA window is clean. 
+ */ +static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn) +{ + int ret; ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn); if (ret) - pr_warn("%pOF: failed to remove direct window: rtas returned " + pr_warn("%pOF: failed to remove DMA window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); else - pr_debug("%pOF: successfully removed direct window: rtas returned " + pr_debug("%pOF: successfully removed DMA window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); } -static void remove_ddw(struct device_node *np, bool remove_prop) +static void remove_dma_window(struct device_node *np, u32 *ddw_avail, + struct property *win) +{ + struct dynamic_dma_window_prop *dwp; + u64 liobn; + + dwp = win->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + clean_dma_window(np, dwp); + __remove_dma_window(np, ddw_avail, liobn); +} + +static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name) { struct property *win; u32 ddw_avail[DDW_APPLICABLE_SIZE]; int ret = 0; + win = of_find_property(np, win_name, NULL); + if (!win) + return -EINVAL; + ret = of_property_read_u32_array(np, "ibm,ddw-applicable", &ddw_avail[0], DDW_APPLICABLE_SIZE); if (ret) - return; + return 0; - win = of_find_property(np, DIRECT64_PROPNAME, NULL); - if (!win) - return; if (win->length >= sizeof(struct dynamic_dma_window_prop)) remove_dma_window(np, ddw_avail, win); if (!remove_prop) - return; + return 0; ret = of_remove_property(np, win); if (ret) - pr_warn("%pOF: failed to remove direct window property: %d\n", + pr_warn("%pOF: failed to remove DMA window property: %d\n", np, ret); + return 0; } -static u64 find_existing_ddw(struct device_node *pdn, int *window_shift) +static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift) { - struct direct_window *window; - const struct dynamic_dma_window_prop *direct64; - u64 dma_addr = 0; + struct dma_win *window; + const struct dynamic_dma_window_prop *dma64; + bool found = false; - spin_lock(&direct_window_list_lock); + spin_lock(&dma_win_list_lock); /* check if we already created a window and dupe that config if so */ - list_for_each_entry(window, &direct_window_list, list) { + list_for_each_entry(window, &dma_win_list, list) { if (window->device == pdn) { - direct64 = window->prop; - dma_addr = be64_to_cpu(direct64->dma_base); - *window_shift = be32_to_cpu(direct64->window_shift); + dma64 = window->prop; + *dma_addr = be64_to_cpu(dma64->dma_base); + *window_shift = be32_to_cpu(dma64->window_shift); + found = true; break; } } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); - return dma_addr; + return found; } -static int find_existing_ddw_windows(void) +static struct dma_win *ddw_list_new_entry(struct device_node *pdn, + const struct dynamic_dma_window_prop *dma64) { - int len; - struct device_node *pdn; - struct direct_window *window; - const struct dynamic_dma_window_prop *direct64; + struct dma_win *window; - if (!firmware_has_feature(FW_FEATURE_LPAR)) - return 0; + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + return NULL; - for_each_node_with_property(pdn, DIRECT64_PROPNAME) { - direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); - if (!direct64) - continue; + window->device = pdn; + window->prop = dma64; - window = kzalloc(sizeof(*window), GFP_KERNEL); - if (!window || len < sizeof(struct 
dynamic_dma_window_prop)) { - kfree(window); - remove_ddw(pdn, true); + return window; +} + +static void find_existing_ddw_windows_named(const char *name) +{ + int len; + struct device_node *pdn; + struct dma_win *window; + const struct dynamic_dma_window_prop *dma64; + + for_each_node_with_property(pdn, name) { + dma64 = of_get_property(pdn, name, &len); + if (!dma64 || len < sizeof(*dma64)) { + remove_ddw(pdn, true, name); continue; } - window->device = pdn; - window->prop = direct64; - spin_lock(&direct_window_list_lock); - list_add(&window->list, &direct_window_list); - spin_unlock(&direct_window_list_lock); + window = ddw_list_new_entry(pdn, dma64); + if (!window) + break; + + spin_lock(&dma_win_list_lock); + list_add(&window->list, &dma_win_list); + spin_unlock(&dma_win_list_lock); } +} + +static int find_existing_ddw_windows(void) +{ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + find_existing_ddw_windows_named(DIRECT64_PROPNAME); + find_existing_ddw_windows_named(DMA64_PROPNAME); return 0; } @@ -1130,6 +1183,35 @@ static int iommu_get_page_shift(u32 query_page_size) return 0; } +static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr, + u32 page_shift, u32 window_shift) +{ + struct dynamic_dma_window_prop *ddwprop; + struct property *win64; + + win64 = kzalloc(sizeof(*win64), GFP_KERNEL); + if (!win64) + return NULL; + + win64->name = kstrdup(propname, GFP_KERNEL); + ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL); + win64->value = ddwprop; + win64->length = sizeof(*ddwprop); + if (!win64->name || !win64->value) { + kfree(win64->name); + kfree(win64->value); + kfree(win64); + return NULL; + } + + ddwprop->liobn = cpu_to_be32(liobn); + ddwprop->dma_base = cpu_to_be64(dma_addr); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(window_shift); + + return win64; +} + /* * If the PE supports dynamic dma windows, and there is space for a table * that can map all pages in a linear offset, then setup such a table, @@ -1139,34 +1221,39 @@ static int iommu_get_page_shift(u32 query_page_size) * pdn: the parent pe node with the ibm,dma_window property * Future: also check if we can remap the base window for our base page size * - * returns the dma offset for use by the direct mapped DMA code. + * returns true if can map all pages (direct mapping), false otherwise.. 
*/ -static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) { int len = 0, ret; int max_ram_len = order_base_2(ddw_memory_hotplug_max()); struct ddw_query_response query; struct ddw_create_response create; int page_shift; - u64 dma_addr; + u64 win_addr; + const char *win_name; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; - struct direct_window *window; + struct dma_win *window; struct property *win64; - struct dynamic_dma_window_prop *ddwprop; + bool ddw_enabled = false; struct failed_ddw_pdn *fpdn; - bool default_win_removed = false; + bool default_win_removed = false, direct_mapping = false; bool pmem_present; + struct pci_dn *pci = PCI_DN(pdn); + struct iommu_table *tbl = pci->table_group->tables[0]; dn = of_find_node_by_type(NULL, "ibm,pmemory"); pmem_present = dn != NULL; of_node_put(dn); - mutex_lock(&direct_window_init_mutex); + mutex_lock(&dma_win_init_mutex); - dma_addr = find_existing_ddw(pdn, &len); - if (dma_addr != 0) + if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) { + direct_mapping = (len >= max_ram_len); + ddw_enabled = true; goto out_unlock; + } /* * If we already went through this for a previous function of @@ -1240,12 +1327,12 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) page_shift = iommu_get_page_shift(query.page_size); if (!page_shift) { - dev_dbg(&dev->dev, "no supported direct page size in mask %x", - query.page_size); + dev_dbg(&dev->dev, "no supported page size in mask %x", + query.page_size); goto out_failed; } - /* verify the window * number of ptes will map the partition */ - /* check largest block * page size > max memory hotplug addr */ + + /* * The "ibm,pmemory" can appear anywhere in the address space. 
* Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS @@ -1261,81 +1348,127 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_info(&dev->dev, "Skipping ibm,pmemory"); } + /* check if the available block * number of ptes will map everything */ if (query.largest_available_block < (1ULL << (len - page_shift))) { dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n", 1ULL << len, query.largest_available_block, 1ULL << page_shift); - goto out_failed; - } - win64 = kzalloc(sizeof(struct property), GFP_KERNEL); - if (!win64) { - dev_info(&dev->dev, - "couldn't allocate property for 64bit dma window\n"); - goto out_failed; - } - win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); - win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); - win64->length = sizeof(*ddwprop); - if (!win64->name || !win64->value) { - dev_info(&dev->dev, - "couldn't allocate property name and value\n"); - goto out_free_prop; + + /* DDW + IOMMU on single window may fail if there is any allocation */ + if (default_win_removed && iommu_table_in_use(tbl)) { + dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n"); + goto out_failed; + } + + len = order_base_2(query.largest_available_block << page_shift); + win_name = DMA64_PROPNAME; + } else { + direct_mapping = true; + win_name = DIRECT64_PROPNAME; } ret = create_ddw(dev, ddw_avail, &create, page_shift, len); if (ret != 0) - goto out_free_prop; - - ddwprop->liobn = cpu_to_be32(create.liobn); - ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) | - create.addr_lo); - ddwprop->tce_shift = cpu_to_be32(page_shift); - ddwprop->window_shift = cpu_to_be32(len); + goto out_failed; dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", create.liobn, dn); - window = kzalloc(sizeof(*window), GFP_KERNEL); - if (!window) - goto out_clear_window; + win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; + win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len); - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, - win64->value, tce_setrange_multi_pSeriesLP_walk); - if (ret) { - dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", - dn, ret); - goto out_free_window; + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property, property name, or value\n"); + goto out_remove_win; } ret = of_add_property(pdn, win64); if (ret) { - dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", - pdn, ret); - goto out_free_window; + dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d", + pdn, ret); + goto out_free_prop; } - window->device = pdn; - window->prop = ddwprop; - spin_lock(&direct_window_list_lock); - list_add(&window->list, &direct_window_list); - spin_unlock(&direct_window_list_lock); + window = ddw_list_new_entry(pdn, win64->value); + if (!window) + goto out_del_prop; + + if (direct_mapping) { + /* DDW maps the whole partition, so enable direct DMA mapping */ + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n", + dn, ret); + + /* Make sure to clean DDW if any TCE was set*/ + clean_dma_window(pdn, win64->value); + goto out_del_list; + } + } else { + struct iommu_table *newtbl; + int i; + + for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) { + const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM; + + /* Look for MMIO32 */ + if 
((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM)
+				break;
+		}
+
+		if (i == ARRAY_SIZE(pci->phb->mem_resources))
+			goto out_del_list;
+
+		/* New table for using DDW instead of the default DMA window */
+		newtbl = iommu_pseries_alloc_table(pci->phb->node);
+		if (!newtbl) {
+			dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
+			goto out_del_list;
+		}
+
+		iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
+					    1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
+		iommu_init_table(newtbl, pci->phb->node, pci->phb->mem_resources[i].start,
+				 pci->phb->mem_resources[i].end);
-	dma_addr = be64_to_cpu(ddwprop->dma_base);
+		pci->table_group->tables[1] = newtbl;
+
+		/* Keep default DMA window struct if removed */
+		if (default_win_removed) {
+			tbl->it_size = 0;
+			kfree(tbl->it_map);
+		}
+
+		set_iommu_table_base(&dev->dev, newtbl);
+	}
+
+	spin_lock(&dma_win_list_lock);
+	list_add(&window->list, &dma_win_list);
+	spin_unlock(&dma_win_list_lock);
+
+	dev->dev.archdata.dma_offset = win_addr;
+	ddw_enabled = true;
 	goto out_unlock;
-out_free_window:
+out_del_list:
 	kfree(window);
-out_clear_window:
-	remove_ddw(pdn, true);
+out_del_prop:
+	of_remove_property(pdn, win64);
 out_free_prop:
 	kfree(win64->name);
 	kfree(win64->value);
 	kfree(win64);
+out_remove_win:
+	/* DDW is clean, so it's ok to call this directly. */
+	__remove_dma_window(pdn, ddw_avail, create.liobn);
+
 out_failed:
 	if (default_win_removed)
 		reset_dma_window(dev, pdn);
@@ -1347,17 +1480,17 @@ out_failed:
 	list_add(&fpdn->list, &failed_ddw_pdn_list);
 out_unlock:
-	mutex_unlock(&direct_window_init_mutex);
+	mutex_unlock(&dma_win_init_mutex);
 	/*
 	 * If we have persistent memory and the window size is only as big
 	 * as RAM, then we failed to create a window to cover persistent
 	 * memory and need to set the DMA limit.
*/ - if (pmem_present && dma_addr && (len == max_ram_len)) - dev->dev.bus_dma_limit = dma_addr + (1ULL << len); + if (pmem_present && ddw_enabled && direct_mapping && len == max_ram_len) + dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len); - return dma_addr; + return ddw_enabled && direct_mapping; } static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) @@ -1399,7 +1532,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) tbl = pci->table_group->tables[0]; iommu_table_setparms_lpar(pci->phb, pdn, tbl, pci->table_group, dma_window); - tbl->it_ops = &iommu_table_lpar_multi_ops; + iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, pci_domain_nr(pci->phb->bus), 0); @@ -1436,11 +1569,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) break; } - if (pdn && PCI_DN(pdn)) { - pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn); - if (pdev->dev.archdata.dma_offset) - return true; - } + if (pdn && PCI_DN(pdn)) + return enable_ddw(pdev, pdn); return false; } @@ -1448,29 +1578,29 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { - struct direct_window *window; + struct dma_win *window; struct memory_notify *arg = data; int ret = 0; switch (action) { case MEM_GOING_ONLINE: - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, arg->nr_pages, window->prop); /* XXX log error */ } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; case MEM_CANCEL_ONLINE: case MEM_OFFLINE: - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, arg->nr_pages, window->prop); /* XXX log error */ } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; default: break; @@ -1491,7 +1621,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti struct of_reconfig_data *rd = data; struct device_node *np = rd->dn; struct pci_dn *pci = PCI_DN(np); - struct direct_window *window; + struct dma_win *window; switch (action) { case OF_RECONFIG_DETACH_NODE: @@ -1502,20 +1632,22 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * we have to remove the property when releasing * the device node. 
*/ - remove_ddw(np, false); + if (remove_ddw(np, false, DIRECT64_PROPNAME)) + remove_ddw(np, false, DMA64_PROPNAME); + if (pci && pci->table_group) iommu_pseries_free_group(pci->table_group, np->full_name); - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { if (window->device == np) { list_del(&window->list); kfree(window); break; } } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; default: err = NOTIFY_DONE; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index dab356e3ff87..3df6bdfea475 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -22,6 +22,8 @@ #include <linux/workqueue.h> #include <linux/proc_fs.h> #include <linux/pgtable.h> +#include <linux/debugfs.h> + #include <asm/processor.h> #include <asm/mmu.h> #include <asm/page.h> @@ -39,7 +41,6 @@ #include <asm/kexec.h> #include <asm/fadump.h> #include <asm/asm-prototypes.h> -#include <asm/debugfs.h> #include <asm/dtl.h> #include "pseries.h" @@ -261,7 +262,7 @@ static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu) if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc) return -EIO; - return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); + return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); } static int cpu_home_node_dispatch_distance(int disp_cpu) @@ -281,7 +282,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu) if (!disp_cpu_assoc || !vcpu_assoc) return -EIO; - return cpu_distance(disp_cpu_assoc, vcpu_assoc); + return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc); } static void update_vcpu_disp_stat(int disp_cpu) @@ -801,7 +802,8 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) return -1; } -static void manual_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace void manual_hpte_clear_all(void) { unsigned long size_bytes = 1UL << ppc64_pft_size; unsigned long hpte_count = size_bytes >> 4; @@ -834,7 +836,8 @@ static void manual_hpte_clear_all(void) } } -static int hcall_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace int hcall_hpte_clear_all(void) { int rc; @@ -845,7 +848,8 @@ static int hcall_hpte_clear_all(void) return rc; } -static void pseries_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace void pseries_hpte_clear_all(void) { int rc; @@ -2016,7 +2020,7 @@ static int __init vpa_debugfs_init(void) if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return 0; - vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); + vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir); /* set up the per-cpu vpa file*/ for_each_possible_cpu(i) { diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 637300330507..1b305e411862 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -13,6 +13,7 @@ #include <asm/hw_irq.h> #include <asm/ppc-pci.h> #include <asm/machdep.h> +#include <asm/xive.h> #include "pseries.h" @@ -110,21 +111,6 @@ static int rtas_query_irq_number(struct pci_dn *pdn, int offset) return rtas_ret[0]; } -static void rtas_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct msi_desc *entry; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->irq) - continue; - - irq_set_msi_desc(entry->irq, NULL); - 
irq_dispose_mapping(entry->irq); - } - - rtas_disable_msi(pdev); -} - static int check_req(struct pci_dev *pdev, int nvec, char *prop_name) { struct device_node *dn; @@ -164,12 +150,12 @@ static int check_req_msix(struct pci_dev *pdev, int nvec) /* Quota calculation */ -static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +static struct device_node *__find_pe_total_msi(struct device_node *node, int *total) { struct device_node *dn; const __be32 *p; - dn = of_node_get(pci_device_to_OF_node(dev)); + dn = of_node_get(node); while (dn) { p = of_get_property(dn, "ibm,pe-total-#msi", NULL); if (p) { @@ -185,6 +171,11 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) return NULL; } +static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +{ + return __find_pe_total_msi(pci_device_to_OF_node(dev), total); +} + static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) { struct device_node *dn; @@ -368,12 +359,11 @@ static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev) pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0); } -static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) +static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type, + msi_alloc_info_t *arg) { struct pci_dn *pdn; - int hwirq, virq, i, quota, rc; - struct msi_desc *entry; - struct msi_msg msg; + int quota, rc; int nvec = nvec_in; int use_32bit_msi_hack = 0; @@ -451,53 +441,229 @@ again: return rc; } - i = 0; - for_each_pci_msi_entry(entry, pdev) { - hwirq = rtas_query_irq_number(pdn, i++); - if (hwirq < 0) { - pr_debug("rtas_msi: error (%d) getting hwirq\n", rc); - return hwirq; - } + return 0; +} - /* - * Depending on the number of online CPUs in the original - * kernel, it is likely for CPU #0 to be offline in a kdump - * kernel. The associated IRQs in the affinity mappings - * provided by irq_create_affinity_masks() are thus not - * started by irq_startup(), as per-design for managed IRQs. - * This can be a problem with multi-queue block devices driven - * by blk-mq : such a non-started IRQ is very likely paired - * with the single queue enforced by blk-mq during kdump (see - * blk_mq_alloc_tag_set()). This causes the device to remain - * silent and likely hangs the guest at some point. - * - * We don't really care for fine-grained affinity when doing - * kdump actually : simply ignore the pre-computed affinity - * masks in this case and let the default mask with all CPUs - * be used when creating the IRQ mappings. - */ - if (is_kdump_kernel()) - virq = irq_create_mapping(NULL, hwirq); - else - virq = irq_create_mapping_affinity(NULL, hwirq, - entry->affinity); +static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + int type = desc->msi_attrib.is_msix ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; - if (!virq) { - pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq); - return -ENOSPC; - } + return rtas_prepare_msi_irqs(pdev, nvec, type, arg); +} + +/* + * ->msi_free() is called before irq_domain_free_irqs_top() when the + * handler data is still available. Use that to clear the XIVE + * controller data. + */ +static void pseries_msi_ops_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int irq) +{ + if (xive_enabled()) + xive_irq_free_data(irq); +} + +/* + * RTAS can not disable one MSI at a time. 
It's all or nothing. Do it + * at the end after all IRQs have been freed. + */ +static void pseries_msi_domain_free_irqs(struct irq_domain *domain, + struct device *dev) +{ + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + __msi_domain_free_irqs(domain, dev); + + rtas_disable_msi(to_pci_dev(dev)); +} + +static struct msi_domain_ops pseries_pci_msi_domain_ops = { + .msi_prepare = pseries_msi_ops_prepare, + .msi_free = pseries_msi_ops_msi_free, + .domain_free_irqs = pseries_msi_domain_free_irqs, +}; + +static void pseries_msi_shutdown(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_shutdown) + d->chip->irq_shutdown(d); +} + +static void pseries_msi_mask(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void pseries_msi_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} - dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq); - irq_set_msi_desc(virq, entry); +static struct irq_chip pseries_pci_msi_irq_chip = { + .name = "pSeries-PCI-MSI", + .irq_shutdown = pseries_msi_shutdown, + .irq_mask = pseries_msi_mask, + .irq_unmask = pseries_msi_unmask, + .irq_eoi = irq_chip_eoi_parent, +}; + +static struct msi_domain_info pseries_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), + .ops = &pseries_pci_msi_domain_ops, + .chip = &pseries_pci_msi_irq_chip, +}; + +static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __pci_read_msi_msg(irq_data_get_msi_desc(data), msg); +} + +static struct irq_chip pseries_msi_irq_chip = { + .name = "pSeries-MSI", + .irq_shutdown = pseries_msi_shutdown, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = pseries_msi_compose_msg, +}; + +static int pseries_irq_parent_domain_alloc(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_fwspec parent_fwspec; + int ret; + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 2; + parent_fwspec.param[0] = hwirq; + parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); + if (ret) + return ret; + + return 0; +} + +static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct pci_controller *phb = domain->host_data; + msi_alloc_info_t *info = arg; + struct msi_desc *desc = info->desc; + struct pci_dev *pdev = msi_desc_to_pci_dev(desc); + int hwirq; + int i, ret; + + hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_attrib.entry_nr); + if (hwirq < 0) { + dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq); + return hwirq; + } + + dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__, + phb->dn, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + ret = pseries_irq_parent_domain_alloc(domain, virq + i, hwirq + i); + if (ret) + goto out; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &pseries_msi_irq_chip, domain->host_data); + } + + return 0; + +out: + /* TODO: handle RTAS cleanup in ->msi_finish() ? 
*/ + irq_domain_free_irqs_parent(domain, virq, i - 1); + return ret; +} + +static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct pci_controller *phb = irq_data_get_irq_chip_data(d); + + pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs); + + /* XIVE domain data is cleared through ->msi_free() */ +} - /* Read config space back so we can restore after reset */ - __pci_read_msi_msg(entry, &msg); - entry->msg = msg; +static const struct irq_domain_ops pseries_irq_domain_ops = { + .alloc = pseries_irq_domain_alloc, + .free = pseries_irq_domain_free, +}; + +static int __pseries_msi_allocate_domains(struct pci_controller *phb, + unsigned int count) +{ + struct irq_domain *parent = irq_get_default_host(); + + phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI", + phb->global_number); + if (!phb->fwnode) + return -ENOMEM; + + phb->dev_domain = irq_domain_create_hierarchy(parent, 0, count, + phb->fwnode, + &pseries_irq_domain_ops, phb); + if (!phb->dev_domain) { + pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + irq_domain_free_fwnode(phb->fwnode); + return -ENOMEM; + } + + phb->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(phb->dn), + &pseries_msi_domain_info, + phb->dev_domain); + if (!phb->msi_domain) { + pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + irq_domain_free_fwnode(phb->fwnode); + irq_domain_remove(phb->dev_domain); + return -ENOMEM; } return 0; } +int pseries_msi_allocate_domains(struct pci_controller *phb) +{ + int count; + + if (!__find_pe_total_msi(phb->dn, &count)) { + pr_err("PCI: failed to find MSIs for bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + return -ENOSPC; + } + + return __pseries_msi_allocate_domains(phb, count); +} + +void pseries_msi_free_domains(struct pci_controller *phb) +{ + if (phb->msi_domain) + irq_domain_remove(phb->msi_domain); + if (phb->dev_domain) + irq_domain_remove(phb->dev_domain); + if (phb->fwnode) + irq_domain_free_fwnode(phb->fwnode); +} + static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) { /* No LSI -> leave MSIs (if any) configured */ @@ -518,8 +684,6 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) static int rtas_msi_init(void) { - struct pci_controller *phb; - query_token = rtas_token("ibm,query-interrupt-source-number"); change_token = rtas_token("ibm,change-msi"); @@ -531,16 +695,6 @@ static int rtas_msi_init(void) pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n"); - WARN_ON(pseries_pci_controller_ops.setup_msi_irqs); - pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; - pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; - - list_for_each_entry(phb, &hose_list, list_node) { - WARN_ON(phb->controller_ops.setup_msi_irqs); - phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; - phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; - } - WARN_ON(ppc_md.pci_irq_fixup); ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup; diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index a8f9140a24fa..90c9d3531694 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -33,6 +33,8 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) pci_devs_phb_init_dynamic(phb); + 
pseries_msi_allocate_domains(phb);
+
 	/* Create EEH devices for the PHB */
 	eeh_phb_pe_create(phb);
@@ -74,6 +76,8 @@ int remove_phb_dynamic(struct pci_controller *phb)
 		}
 	}
+	pseries_msi_free_domains(phb);
+
 	/* Remove the PCI bus and unregister the bridge device from sysfs */
 	phb->bus = NULL;
 	pci_remove_bus(b);
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 1f051a786fb3..3544778e06d0 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -85,6 +85,8 @@ struct pci_host_bridge;
 int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);
 extern struct pci_controller_ops pseries_pci_controller_ops;
+int pseries_msi_allocate_domains(struct pci_controller *phb);
+void pseries_msi_free_domains(struct pci_controller *phb);
 unsigned long pseries_memory_block_size(void);
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 167f2e1b8d39..56092dccfdb8 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -783,7 +783,7 @@ static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt)
 {
 	int recovered = 0;
-	if (!(regs->msr & MSR_RI)) {
+	if (regs_is_unrecoverable(regs)) {
 		/* If MSR_RI isn't set, we cannot recover */
 		pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
 		recovered = 0;
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 0dfaa6ab44cc..f79126f16258 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -486,6 +486,8 @@ static void __init pSeries_discover_phbs(void)
 		/* create pci_dn's for DT nodes under this PHB */
 		pci_devs_phb_init_dynamic(phb);
+
+		pseries_msi_allocate_domains(phb);
 	}
 	of_node_put(root);
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index 1d829e257996..87f001b4c4e4 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -63,6 +63,9 @@ void __init svm_swiotlb_init(void)
 int set_memory_encrypted(unsigned long addr, int numpages)
 {
+	if (!mem_encrypt_active())
+		return 0;
+
 	if (!PAGE_ALIGNED(addr))
 		return -EINVAL;
@@ -73,6 +76,9 @@ int set_memory_encrypted(unsigned long addr, int numpages)
 int set_memory_decrypted(unsigned long addr, int numpages)
 {
+	if (!mem_encrypt_active())
+		return 0;
+
 	if (!PAGE_ALIGNED(addr))
 		return -EINVAL;
diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c
index b5c1cf1bc64d..b043e3936d21 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -184,7 +184,7 @@ static int h_get_nx_fault(u32 winid, u64 buffer)
  * Note: The hypervisor forwards an interrupt for each fault request.
  * So one fault CRB to process for each H_GET_NX_FAULT hcall.
*/ -irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) +static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) { struct pseries_vas_window *txwin = data; struct coprocessor_request_block crb; diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 5a95b8ea23d8..ff7906b48ca1 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -108,7 +108,7 @@ int fsl_rio_mcheck_exception(struct pt_regs *regs) __func__); out_be32((u32 *)(rio_regs_win + RIO_LTLEDCSR), 0); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/sysdev/xics/ics-native.c b/arch/powerpc/sysdev/xics/ics-native.c index d450502f4053..dec7d93a8ba1 100644 --- a/arch/powerpc/sysdev/xics/ics-native.c +++ b/arch/powerpc/sysdev/xics/ics-native.c @@ -131,19 +131,15 @@ static struct irq_chip ics_native_irq_chip = { .irq_retrigger = xics_retrigger, }; -static int ics_native_map(struct ics *ics, unsigned int virq) +static int ics_native_check(struct ics *ics, unsigned int hw_irq) { - unsigned int vec = (unsigned int)virq_to_hw(virq); struct ics_native *in = to_ics_native(ics); - pr_devel("%s: vec=0x%x\n", __func__, vec); + pr_devel("%s: hw_irq=0x%x\n", __func__, hw_irq); - if (vec < in->ibase || vec >= (in->ibase + in->icount)) + if (hw_irq < in->ibase || hw_irq >= (in->ibase + in->icount)) return -EINVAL; - irq_set_chip_and_handler(virq, &ics_native_irq_chip, handle_fasteoi_irq); - irq_set_chip_data(virq, ics); - return 0; } @@ -177,10 +173,11 @@ static int ics_native_host_match(struct ics *ics, struct device_node *node) } static struct ics ics_native_template = { - .map = ics_native_map, + .check = ics_native_check, .mask_unknown = ics_native_mask_unknown, .get_server = ics_native_get_server, .host_match = ics_native_host_match, + .chip = &ics_native_irq_chip, }; static int __init ics_native_add_one(struct device_node *np) diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c index 823f6c9664cd..c4d95d8beb6f 100644 --- a/arch/powerpc/sysdev/xics/ics-opal.c +++ b/arch/powerpc/sysdev/xics/ics-opal.c @@ -62,17 +62,6 @@ static void ics_opal_unmask_irq(struct irq_data *d) static unsigned int ics_opal_startup(struct irq_data *d) { -#ifdef CONFIG_PCI_MSI - /* - * The generic MSI code returns with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. 
- */ - if (irq_data_get_msi_desc(d)) - pci_msi_unmask_irq(d); -#endif - - /* unmask it */ ics_opal_unmask_irq(d); return 0; } @@ -133,7 +122,7 @@ static int ics_opal_set_affinity(struct irq_data *d, } server = ics_opal_mangle_server(wanted_server); - pr_devel("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n", + pr_debug("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n", d->irq, hw_irq, wanted_server, server); rc = opal_set_xive(hw_irq, server, priority); @@ -157,26 +146,13 @@ static struct irq_chip ics_opal_irq_chip = { .irq_retrigger = xics_retrigger, }; -static int ics_opal_map(struct ics *ics, unsigned int virq); -static void ics_opal_mask_unknown(struct ics *ics, unsigned long vec); -static long ics_opal_get_server(struct ics *ics, unsigned long vec); - static int ics_opal_host_match(struct ics *ics, struct device_node *node) { return 1; } -/* Only one global & state struct ics */ -static struct ics ics_hal = { - .map = ics_opal_map, - .mask_unknown = ics_opal_mask_unknown, - .get_server = ics_opal_get_server, - .host_match = ics_opal_host_match, -}; - -static int ics_opal_map(struct ics *ics, unsigned int virq) +static int ics_opal_check(struct ics *ics, unsigned int hw_irq) { - unsigned int hw_irq = (unsigned int)virq_to_hw(virq); int64_t rc; __be16 server; int8_t priority; @@ -189,9 +165,6 @@ static int ics_opal_map(struct ics *ics, unsigned int virq) if (rc != OPAL_SUCCESS) return -ENXIO; - irq_set_chip_and_handler(virq, &ics_opal_irq_chip, handle_fasteoi_irq); - irq_set_chip_data(virq, &ics_hal); - return 0; } @@ -222,6 +195,15 @@ static long ics_opal_get_server(struct ics *ics, unsigned long vec) return ics_opal_unmangle_server(be16_to_cpu(server)); } +/* Only one global & state struct ics */ +static struct ics ics_hal = { + .check = ics_opal_check, + .mask_unknown = ics_opal_mask_unknown, + .get_server = ics_opal_get_server, + .host_match = ics_opal_host_match, + .chip = &ics_opal_irq_chip, +}; + int __init ics_opal_init(void) { if (!firmware_has_feature(FW_FEATURE_OPAL)) diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index 4cf18000f07c..b9da317b7a2d 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -24,19 +24,6 @@ static int ibm_set_xive; static int ibm_int_on; static int ibm_int_off; -static int ics_rtas_map(struct ics *ics, unsigned int virq); -static void ics_rtas_mask_unknown(struct ics *ics, unsigned long vec); -static long ics_rtas_get_server(struct ics *ics, unsigned long vec); -static int ics_rtas_host_match(struct ics *ics, struct device_node *node); - -/* Only one global & state struct ics */ -static struct ics ics_rtas = { - .map = ics_rtas_map, - .mask_unknown = ics_rtas_mask_unknown, - .get_server = ics_rtas_get_server, - .host_match = ics_rtas_host_match, -}; - static void ics_rtas_unmask_irq(struct irq_data *d) { unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); @@ -70,15 +57,6 @@ static void ics_rtas_unmask_irq(struct irq_data *d) static unsigned int ics_rtas_startup(struct irq_data *d) { -#ifdef CONFIG_PCI_MSI - /* - * The generic MSI code returns with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. 
- */ - if (irq_data_get_msi_desc(d)) - pci_msi_unmask_irq(d); -#endif /* unmask it */ ics_rtas_unmask_irq(d); return 0; @@ -146,6 +124,9 @@ static int ics_rtas_set_affinity(struct irq_data *d, return -1; } + pr_debug("%s: irq %d [hw 0x%x] server: 0x%x\n", __func__, d->irq, + hw_irq, irq_server); + status = rtas_call_reentrant(ibm_set_xive, 3, 1, NULL, hw_irq, irq_server, xics_status[1]); @@ -169,9 +150,8 @@ static struct irq_chip ics_rtas_irq_chip = { .irq_retrigger = xics_retrigger, }; -static int ics_rtas_map(struct ics *ics, unsigned int virq) +static int ics_rtas_check(struct ics *ics, unsigned int hw_irq) { - unsigned int hw_irq = (unsigned int)virq_to_hw(virq); int status[2]; int rc; @@ -183,9 +163,6 @@ static int ics_rtas_map(struct ics *ics, unsigned int virq) if (rc) return -ENXIO; - irq_set_chip_and_handler(virq, &ics_rtas_irq_chip, handle_fasteoi_irq); - irq_set_chip_data(virq, &ics_rtas); - return 0; } @@ -213,6 +190,15 @@ static int ics_rtas_host_match(struct ics *ics, struct device_node *node) return !of_device_is_compatible(node, "chrp,iic"); } +/* Only one global & state struct ics */ +static struct ics ics_rtas = { + .check = ics_rtas_check, + .mask_unknown = ics_rtas_mask_unknown, + .get_server = ics_rtas_get_server, + .host_match = ics_rtas_host_match, + .chip = &ics_rtas_irq_chip, +}; + __init int ics_rtas_init(void) { ibm_get_xive = rtas_token("ibm,get-xive"); diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index b14c502e56a8..244a727c6ba4 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -38,7 +38,7 @@ DEFINE_PER_CPU(struct xics_cppr, xics_cppr); struct irq_domain *xics_host; -static LIST_HEAD(ics_list); +static struct ics *xics_ics; void xics_update_irq_servers(void) { @@ -111,12 +111,11 @@ void xics_setup_cpu(void) void xics_mask_unknown_vec(unsigned int vec) { - struct ics *ics; - pr_err("Interrupt 0x%x (real) is invalid, disabling it.\n", vec); - list_for_each_entry(ics, &ics_list, link) - ics->mask_unknown(ics, vec); + if (WARN_ON(!xics_ics)) + return; + xics_ics->mask_unknown(xics_ics, vec); } @@ -133,7 +132,7 @@ static void xics_request_ipi(void) * IPIs are marked IRQF_PERCPU. The handler was set in map. */ BUG_ON(request_irq(ipi, icp_ops->ipi_action, - IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); + IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); } void __init xics_smp_probe(void) @@ -184,6 +183,8 @@ void xics_migrate_irqs_away(void) unsigned int irq, virq; struct irq_desc *desc; + pr_debug("%s: CPU %u\n", __func__, cpu); + /* If we used to be the default server, move to the new "boot_cpuid" */ if (hw_cpu == xics_default_server) xics_update_irq_servers(); @@ -198,7 +199,7 @@ void xics_migrate_irqs_away(void) struct irq_chip *chip; long server; unsigned long flags; - struct ics *ics; + struct irq_data *irqd; /* We can't set affinity on ISA interrupts */ if (virq < NR_IRQS_LEGACY) @@ -206,9 +207,11 @@ void xics_migrate_irqs_away(void) /* We only need to migrate enabled IRQS */ if (!desc->action) continue; - if (desc->irq_data.domain != xics_host) + /* We need a mapping in the XICS IRQ domain */ + irqd = irq_domain_get_irq_data(xics_host, virq); + if (!irqd) continue; - irq = desc->irq_data.hwirq; + irq = irqd_to_hwirq(irqd); /* We need to get IPIs still. 
*/ if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) continue; @@ -219,13 +222,10 @@ void xics_migrate_irqs_away(void) raw_spin_lock_irqsave(&desc->lock, flags); /* Locate interrupt server */ - server = -1; - ics = irq_desc_get_chip_data(desc); - if (ics) - server = ics->get_server(ics, irq); + server = xics_ics->get_server(xics_ics, irq); if (server < 0) { - printk(KERN_ERR "%s: Can't find server for irq %d\n", - __func__, irq); + pr_err("%s: Can't find server for irq %d/%x\n", + __func__, virq, irq); goto unlock; } @@ -307,13 +307,9 @@ int xics_get_irq_server(unsigned int virq, const struct cpumask *cpumask, static int xics_host_match(struct irq_domain *h, struct device_node *node, enum irq_domain_bus_token bus_token) { - struct ics *ics; - - list_for_each_entry(ics, &ics_list, link) - if (ics->host_match(ics, node)) - return 1; - - return 0; + if (WARN_ON(!xics_ics)) + return 0; + return xics_ics->host_match(xics_ics, node) ? 1 : 0; } /* Dummies */ @@ -327,12 +323,10 @@ static struct irq_chip xics_ipi_chip = { .irq_unmask = xics_ipi_unmask, }; -static int xics_host_map(struct irq_domain *h, unsigned int virq, - irq_hw_number_t hw) +static int xics_host_map(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { - struct ics *ics; - - pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw); + pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hwirq); /* * Mark interrupts as edge sensitive by default so that resend @@ -342,18 +336,23 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq, irq_clear_status_flags(virq, IRQ_LEVEL); /* Don't call into ICS for IPIs */ - if (hw == XICS_IPI) { + if (hwirq == XICS_IPI) { irq_set_chip_and_handler(virq, &xics_ipi_chip, handle_percpu_irq); return 0; } - /* Let the ICS setup the chip data */ - list_for_each_entry(ics, &ics_list, link) - if (ics->map(ics, virq) == 0) - return 0; + if (WARN_ON(!xics_ics)) + return -EINVAL; + + if (xics_ics->check(xics_ics, hwirq)) + return -EINVAL; + + /* Let the ICS be the chip data for the XICS domain. 
For ICS native */ + irq_domain_set_info(domain, virq, hwirq, xics_ics->chip, + xics_ics, handle_fasteoi_irq, NULL, NULL); - return -EINVAL; + return 0; } static int xics_host_xlate(struct irq_domain *h, struct device_node *ct, @@ -412,22 +411,76 @@ int xics_retrigger(struct irq_data *data) return 0; } +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +static int xics_host_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *hwirq, unsigned int *type) +{ + return xics_host_xlate(d, to_of_node(fwspec->fwnode), fwspec->param, + fwspec->param_count, hwirq, type); +} + +static int xics_host_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + int i, rc; + + rc = xics_host_domain_translate(domain, fwspec, &hwirq, &type); + if (rc) + return rc; + + pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) + irq_domain_set_info(domain, virq + i, hwirq + i, xics_ics->chip, + xics_ics, handle_fasteoi_irq, NULL, NULL); + + return 0; +} + +static void xics_host_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + pr_debug("%s %d #%d\n", __func__, virq, nr_irqs); +} +#endif + static const struct irq_domain_ops xics_host_ops = { +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + .alloc = xics_host_domain_alloc, + .free = xics_host_domain_free, + .translate = xics_host_domain_translate, +#endif .match = xics_host_match, .map = xics_host_map, .xlate = xics_host_xlate, }; -static void __init xics_init_host(void) +static int __init xics_allocate_domain(void) { - xics_host = irq_domain_add_tree(NULL, &xics_host_ops, NULL); - BUG_ON(xics_host == NULL); + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_fwnode("XICS"); + if (!fn) + return -ENOMEM; + + xics_host = irq_domain_create_tree(fn, &xics_host_ops, NULL); + if (!xics_host) { + irq_domain_free_fwnode(fn); + return -ENOMEM; + } + irq_set_default_host(xics_host); + return 0; } void __init xics_register_ics(struct ics *ics) { - list_add(&ics->link, &ics_list); + if (WARN_ONCE(xics_ics, "XICS: Source Controller is already defined !")) + return; + xics_ics = ics; } static void __init xics_get_server_size(void) @@ -484,6 +537,8 @@ void __init xics_init(void) /* Initialize common bits */ xics_get_server_size(); xics_update_irq_servers(); - xics_init_host(); + rc = xics_allocate_domain(); + if (rc < 0) + pr_err("XICS: Failed to create IRQ domain"); xics_setup_cpu(); } diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 8183ca343675..c732ce5a3e1a 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -21,7 +21,6 @@ #include <linux/msi.h> #include <linux/vmalloc.h> -#include <asm/debugfs.h> #include <asm/prom.h> #include <asm/io.h> #include <asm/smp.h> @@ -313,11 +312,10 @@ void xmon_xive_get_irq_all(void) struct irq_desc *desc; for_each_irq_desc(i, desc) { - struct irq_data *d = irq_desc_get_irq_data(desc); - unsigned int hwirq = (unsigned int)irqd_to_hwirq(d); + struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i); - if (d->domain == xive_irq_domain) - xmon_xive_get_irq_config(hwirq, d); + if (d) + xmon_xive_get_irq_config(irqd_to_hwirq(d), d); } } @@ -617,16 +615,6 @@ static unsigned int xive_irq_startup(struct irq_data *d) pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", d->irq, hw_irq, d); -#ifdef CONFIG_PCI_MSI - /* - * The generic MSI code returns 
with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. - */ - if (irq_data_get_msi_desc(d)) - pci_msi_unmask_irq(d); -#endif - /* Pick a target */ target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); if (target == XIVE_INVALID_TARGET) { @@ -714,16 +702,12 @@ static int xive_irq_set_affinity(struct irq_data *d, u32 target, old_target; int rc = 0; - pr_devel("xive_irq_set_affinity: irq %d\n", d->irq); + pr_debug("%s: irq %d/%x\n", __func__, d->irq, hw_irq); /* Is this valid ? */ if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) return -EINVAL; - /* Don't do anything if the interrupt isn't started */ - if (!irqd_is_started(d)) - return IRQ_SET_MASK_OK; - /* * If existing target is already in the new mask, and is * online then do nothing. @@ -759,7 +743,7 @@ static int xive_irq_set_affinity(struct irq_data *d, return rc; } - pr_devel(" target: 0x%x\n", target); + pr_debug(" target: 0x%x\n", target); xd->target = target; /* Give up previous target */ @@ -990,6 +974,8 @@ EXPORT_SYMBOL_GPL(is_xive_irq); void xive_cleanup_irq_data(struct xive_irq_data *xd) { + pr_debug("%s for HW %x\n", __func__, xd->hw_irq); + if (xd->eoi_mmio) { iounmap(xd->eoi_mmio); if (xd->eoi_mmio == xd->trig_mmio) @@ -1031,7 +1017,7 @@ static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) return 0; } -static void xive_irq_free_data(unsigned int virq) +void xive_irq_free_data(unsigned int virq) { struct xive_irq_data *xd = irq_get_handler_data(virq); @@ -1041,6 +1027,7 @@ static void xive_irq_free_data(unsigned int virq) xive_cleanup_irq_data(xd); kfree(xd); } +EXPORT_SYMBOL_GPL(xive_irq_free_data); #ifdef CONFIG_SMP @@ -1179,7 +1166,7 @@ static int xive_request_ipi(unsigned int cpu) return 0; ret = request_irq(xid->irq, xive_muxed_ipi_action, - IRQF_PERCPU | IRQF_NO_THREAD, + IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL); WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); @@ -1379,7 +1366,71 @@ static void xive_irq_domain_debug_show(struct seq_file *m, struct irq_domain *d, } #endif +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +static int xive_irq_domain_translate(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *hwirq, + unsigned int *type) +{ + return xive_irq_domain_xlate(d, to_of_node(fwspec->fwnode), + fwspec->param, fwspec->param_count, + hwirq, type); +} + +static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + int i, rc; + + rc = xive_irq_domain_translate(domain, fwspec, &hwirq, &type); + if (rc) + return rc; + + pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + /* TODO: call xive_irq_domain_map() */ + + /* + * Mark interrupts as edge sensitive by default so that resend + * actually works. Will fix that up below if needed. 
+ */ + irq_clear_status_flags(virq, IRQ_LEVEL); + + /* allocates and sets handler data */ + rc = xive_irq_alloc_data(virq + i, hwirq + i); + if (rc) + return rc; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &xive_irq_chip, domain->host_data); + irq_set_handler(virq + i, handle_fasteoi_irq); + } + + return 0; +} + +static void xive_irq_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + int i; + + pr_debug("%s %d #%d\n", __func__, virq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) + xive_irq_free_data(virq + i); +} +#endif + static const struct irq_domain_ops xive_irq_domain_ops = { +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + .alloc = xive_irq_domain_alloc, + .free = xive_irq_domain_free, + .translate = xive_irq_domain_translate, +#endif .match = xive_irq_domain_match, .map = xive_irq_domain_map, .unmap = xive_irq_domain_unmap, @@ -1717,9 +1768,9 @@ static int xive_core_debug_show(struct seq_file *m, void *private) xive_debug_show_cpu(m, cpu); for_each_irq_desc(i, desc) { - struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i); - if (d->domain == xive_irq_domain) + if (d) xive_debug_show_irq(m, d); } return 0; @@ -1729,7 +1780,7 @@ DEFINE_SHOW_ATTRIBUTE(xive_core_debug); int xive_core_debug_init(void) { if (xive_enabled()) - debugfs_create_file("xive", 0400, powerpc_debugfs_root, + debugfs_create_file("xive", 0400, arch_debugfs_dir, NULL, &xive_core_debug_fops); return 0; } diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 57e3f1540435..1aec282cd650 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -41,6 +41,7 @@ static u32 xive_queue_shift; static u32 xive_pool_vps = XIVE_INVALID_VP; static struct kmem_cache *xive_provision_cache; static bool xive_has_single_esc; +static bool xive_has_save_restore; int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) { @@ -588,6 +589,9 @@ bool __init xive_native_init(void) if (of_get_property(np, "single-escalation-support", NULL) != NULL) xive_has_single_esc = true; + if (of_get_property(np, "vp-save-restore", NULL)) + xive_has_save_restore = true; + /* Configure Thread Management areas for KVM */ for_each_possible_cpu(cpu) kvmppc_set_xive_tima(cpu, r.start, tima); @@ -752,6 +756,12 @@ bool xive_native_has_single_escalation(void) } EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); +bool xive_native_has_save_restore(void) +{ + return xive_has_save_restore; +} +EXPORT_SYMBOL_GPL(xive_native_has_save_restore); + int xive_native_get_queue_info(u32 vp_id, u32 prio, u64 *out_qpage, u64 *out_qsize, diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh index e32d3162e5ed..689907cda996 100644 --- a/arch/powerpc/tools/head_check.sh +++ b/arch/powerpc/tools/head_check.sh @@ -49,30 +49,30 @@ vmlinux="$2" $nm "$vmlinux" | grep -e " [TA] _stext$" -e " t start_first_256B$" -e " a text_start$" -e " t start_text$" > .tmp_symbols.txt -vma=$(cat .tmp_symbols.txt | grep -e " [TA] _stext$" | cut -d' ' -f1) +vma=$(grep -e " [TA] _stext$" .tmp_symbols.txt | cut -d' ' -f1) -expected_start_head_addr=$vma +expected_start_head_addr="$vma" -start_head_addr=$(cat .tmp_symbols.txt | grep " t start_first_256B$" | cut -d' ' -f1) +start_head_addr=$(grep " t start_first_256B$" .tmp_symbols.txt | cut -d' ' -f1) if [ "$start_head_addr" != "$expected_start_head_addr" ]; then - echo "ERROR: head code starts at $start_head_addr, should be 
$expected_start_head_addr" - echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" - echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" 1>&2 + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" 1>&2 + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" 1>&2 exit 1 fi -top_vma=$(echo $vma | cut -d'0' -f1) +top_vma=$(echo "$vma" | cut -d'0' -f1) -expected_start_text_addr=$(cat .tmp_symbols.txt | grep " a text_start$" | cut -d' ' -f1 | sed "s/^0/$top_vma/") +expected_start_text_addr=$(grep " a text_start$" .tmp_symbols.txt | cut -d' ' -f1 | sed "s/^0/$top_vma/") -start_text_addr=$(cat .tmp_symbols.txt | grep " t start_text$" | cut -d' ' -f1) +start_text_addr=$(grep " t start_text$" .tmp_symbols.txt | cut -d' ' -f1) if [ "$start_text_addr" != "$expected_start_text_addr" ]; then - echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" - echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" - echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" 1>&2 + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" 1>&2 + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" 1>&2 exit 1 fi diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index da4d7f225a40..dd8241c009e5 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -26,8 +26,8 @@ #include <linux/ctype.h> #include <linux/highmem.h> #include <linux/security.h> +#include <linux/debugfs.h> -#include <asm/debugfs.h> #include <asm/ptrace.h> #include <asm/smp.h> #include <asm/string.h> @@ -482,16 +482,6 @@ static inline void get_output_lock(void) {} static inline void release_output_lock(void) {} #endif -static inline int unrecoverable_excp(struct pt_regs *regs) -{ -#if defined(CONFIG_4xx) || defined(CONFIG_PPC_BOOK3E) - /* We have no MSR_RI bit on 4xx or Book3e, so we simply return false */ - return 0; -#else - return ((regs->msr & MSR_RI) == 0); -#endif -} - static void xmon_touch_watchdogs(void) { touch_softlockup_watchdog_sync(); @@ -565,7 +555,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) bp = NULL; if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) == (MSR_IR|MSR_64BIT)) bp = at_breakpoint(regs->nip); - if (bp || unrecoverable_excp(regs)) + if (bp || regs_is_unrecoverable(regs)) fromipi = 0; if (!fromipi) { @@ -577,7 +567,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) cpu, BP_NUM(bp)); xmon_print_symbol(regs->nip, " ", ")\n"); } - if (unrecoverable_excp(regs)) + if (regs_is_unrecoverable(regs)) printf("WARNING: exception is not recoverable, " "can't continue\n"); release_output_lock(); @@ -693,7 +683,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) printf("Stopped at breakpoint %tx (", BP_NUM(bp)); xmon_print_symbol(regs->nip, " ", ")\n"); } - if (unrecoverable_excp(regs)) + if (regs_is_unrecoverable(regs)) printf("WARNING: exception is not recoverable, " "can't continue\n"); remove_bpts(); @@ -4077,8 +4067,8 @@ DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get, static int __init setup_xmon_dbgfs(void) { - debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL, - &xmon_dbgfs_ops); + debugfs_create_file("xmon", 0600, arch_debugfs_dir, NULL, + &xmon_dbgfs_ops); return 0; } device_initcall(setup_xmon_dbgfs); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig 
index 1452584f83be..301a54233c7e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -13,9 +13,7 @@ config 32BIT config RISCV def_bool y select ARCH_CLOCKSOURCE_INIT - select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU - select ARCH_STACKWALK + select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_VIRTUAL if MMU @@ -31,14 +29,19 @@ config RISCV select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT + select ARCH_STACKWALK + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU select ARCH_SUPPORTS_HUGETLBFS if MMU select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU + select BUILDTIME_TABLE_SORT if MMU select CLONE_BACKWARDS select CLINT_TIMER if !MMU select COMMON_CLK @@ -48,9 +51,11 @@ config RISCV select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO - select GENERIC_IOREMAP + select GENERIC_IDLE_POLL_SETUP + select GENERIC_IOREMAP if MMU select GENERIC_IRQ_MULTI_HANDLER select GENERIC_IRQ_SHOW + select GENERIC_IRQ_SHOW_LEVEL select GENERIC_LIB_DEVMEM_IS_ALLOWED select GENERIC_PCI_IOMAP select GENERIC_PTDUMP if MMU @@ -70,6 +75,7 @@ config RISCV select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU + select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_VMAP_STACK if MMU && 64BIT select HAVE_ASM_MODVERSIONS select HAVE_CONTEXT_TRACKING @@ -95,6 +101,7 @@ config RISCV select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select IRQ_DOMAIN + select IRQ_FORCED_THREADING select MODULES_USE_ELF_RELA if MODULES select MODULE_SECTIONS if MODULES select OF @@ -107,6 +114,7 @@ config RISCV select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU select ZONE_DMA32 if 64BIT @@ -176,9 +184,6 @@ config ARCH_SUPPORTS_UPROBES config STACKTRACE_SUPPORT def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config GENERIC_BUG def_bool y depends on BUG @@ -231,7 +236,7 @@ config ARCH_RV32I config ARCH_RV64I bool "RV64I" select 64BIT - select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && GCC_VERSION >= 50000 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && $(cc-option,-fpatchable-function-entry=8) select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index bc74afdbf31e..0eb4568fbd29 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -41,7 +41,7 @@ endif ifeq ($(CONFIG_LD_IS_LLD),y) KBUILD_CFLAGS += -mno-relax KBUILD_AFLAGS += -mno-relax -ifneq ($(LLVM_IAS),1) +ifndef CONFIG_AS_IS_LLVM KBUILD_CFLAGS += -Wa,-mno-relax KBUILD_AFLAGS += -Wa,-mno-relax endif @@ -108,6 +108,12 @@ PHONY += vdso_install vdso_install: $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@ +ifeq ($(CONFIG_MMU),y) +prepare: vdso_prepare +vdso_prepare: prepare0 + $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso 
include/generated/vdso-offsets.h +endif + ifneq ($(CONFIG_XIP_KERNEL),y) ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN),yy) KBUILD_IMAGE := $(boot)/loader.bin @@ -126,8 +132,11 @@ $(BOOT_TARGETS): vmlinux Image.%: Image $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ -zinstall install: - $(Q)$(MAKE) $(build)=$(boot) $@ +install: install-image = Image +zinstall: install-image = Image.gz +install zinstall: + $(CONFIG_SHELL) $(srctree)/$(boot)/install.sh $(KERNELRELEASE) \ + $(boot)/$(install-image) System.map "$(INSTALL_PATH)" archclean: $(Q)$(MAKE) $(clean)=$(boot) diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile index 6bf299f70c27..becd0621071c 100644 --- a/arch/riscv/boot/Makefile +++ b/arch/riscv/boot/Makefile @@ -58,11 +58,3 @@ $(obj)/Image.lzo: $(obj)/Image FORCE $(obj)/loader.bin: $(obj)/loader FORCE $(call if_changed,objcopy) - -install: - $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \ - $(obj)/Image System.map "$(INSTALL_PATH)" - -zinstall: - $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \ - $(obj)/Image.gz System.map "$(INSTALL_PATH)" diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts index baea7d204639..b254c60589a1 100644 --- a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts @@ -16,10 +16,14 @@ aliases { ethernet0 = &emac1; + serial0 = &serial0; + serial1 = &serial1; + serial2 = &serial2; + serial3 = &serial3; }; chosen { - stdout-path = &serial0; + stdout-path = "serial0:115200n8"; }; cpus { diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index bc68231a8fb7..4ebc80315f01 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -39,10 +39,12 @@ CONFIG_PCI=y CONFIG_PCIEPORTBUS=y CONFIG_PCI_HOST_GENERIC=y CONFIG_PCIE_XILINX=y +CONFIG_PCIE_FU740=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_NVME=m CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y CONFIG_SCSI_VIRTIO=y @@ -108,6 +110,8 @@ CONFIG_NFS_V4_1=y CONFIG_NFS_V4_2=y CONFIG_ROOT_NFS=y CONFIG_9P_FS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=m CONFIG_CRYPTO_USER_API_HASH=y CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_PRINTK_TIME=y diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index f4b490cd0e5d..f53c40026c7a 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -42,6 +42,9 @@ */ #define ELF_ET_DYN_BASE ((TASK_SIZE / 3) * 2) +#ifdef CONFIG_64BIT +#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) +#endif /* * This yields a mask that user programs can use to figure out what * instruction set this CPU supports. 
This could be done in user space, diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index b0ca5058e7ae..109c97e991a6 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -79,8 +79,8 @@ typedef struct page *pgtable_t; #endif #ifdef CONFIG_MMU -extern unsigned long pfn_base; -#define ARCH_PFN_OFFSET (pfn_base) +extern unsigned long riscv_pfn_base; +#define ARCH_PFN_OFFSET (riscv_pfn_base) #else #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ @@ -91,10 +91,8 @@ struct kernel_mapping { uintptr_t size; /* Offset between linear mapping virtual address and kernel load address */ unsigned long va_pa_offset; -#ifdef CONFIG_64BIT /* Offset between kernel mapping virtual address and kernel load address */ unsigned long va_kernel_pa_offset; -#endif unsigned long va_kernel_xip_pa_offset; #ifdef CONFIG_XIP_KERNEL uintptr_t xiprom; @@ -105,11 +103,11 @@ struct kernel_mapping { extern struct kernel_mapping kernel_map; extern phys_addr_t phys_ram_base; -#ifdef CONFIG_64BIT #define is_kernel_mapping(x) \ ((x) >= kernel_map.virt_addr && (x) < (kernel_map.virt_addr + kernel_map.size)) + #define is_linear_mapping(x) \ - ((x) >= PAGE_OFFSET && (x) < kernel_map.virt_addr) + ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < kernel_map.virt_addr)) #define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + kernel_map.va_pa_offset)) #define kernel_mapping_pa_to_va(y) ({ \ @@ -123,7 +121,7 @@ extern phys_addr_t phys_ram_base; #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - kernel_map.va_pa_offset) #define kernel_mapping_va_to_pa(y) ({ \ unsigned long _y = y; \ - (_y < kernel_map.virt_addr + XIP_OFFSET) ? \ + (IS_ENABLED(CONFIG_XIP_KERNEL) && _y < kernel_map.virt_addr + XIP_OFFSET) ? \ ((unsigned long)(_y) - kernel_map.va_kernel_xip_pa_offset) : \ ((unsigned long)(_y) - kernel_map.va_kernel_pa_offset - XIP_OFFSET); \ }) @@ -133,15 +131,6 @@ extern phys_addr_t phys_ram_base; is_linear_mapping(_x) ? \ linear_mapping_va_to_pa(_x) : kernel_mapping_va_to_pa(_x); \ }) -#else -#define is_kernel_mapping(x) \ - ((x) >= kernel_map.virt_addr && (x) < (kernel_map.virt_addr + kernel_map.size)) -#define is_linear_mapping(x) \ - ((x) >= PAGE_OFFSET) - -#define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + kernel_map.va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - kernel_map.va_pa_offset) -#endif /* CONFIG_64BIT */ #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 021ed64ee608..46b492c78cbb 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -37,6 +37,14 @@ struct thread_struct { unsigned long bad_cause; }; +/* Whitelist the fstate from the task_struct for hardened usercopy */ +static inline void arch_thread_struct_whitelist(unsigned long *offset, + unsigned long *size) +{ + *offset = offsetof(struct thread_struct, fstate); + *size = sizeof_field(struct thread_struct, fstate); +} + #define INIT_THREAD { \ .sp = sizeof(init_stack) + (long)&init_stack, \ } diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index 1453a2f563bc..893e47195e30 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -8,26 +8,25 @@ #ifndef _ASM_RISCV_VDSO_H #define _ASM_RISCV_VDSO_H + +/* + * All systems with an MMU have a VDSO, but systems without an MMU don't + * support shared libraries and therefor don't have one. 
+ */ +#ifdef CONFIG_MMU + #include <linux/types.h> +#include <generated/vdso-offsets.h> #ifndef CONFIG_GENERIC_TIME_VSYSCALL struct vdso_data { }; #endif -/* - * The VDSO symbols are mapped into Linux so we can just use regular symbol - * addressing to get their offsets in userspace. The symbols are mapped at an - * offset of 0, but since the linker must support setting weak undefined - * symbols to the absolute address 0 it also happens to support other low - * addresses even when the code model suggests those low addresses would not - * otherwise be availiable. - */ #define VDSO_SYMBOL(base, name) \ -({ \ - extern const char __vdso_##name[]; \ - (void __user *)((unsigned long)(base) + __vdso_##name); \ -}) + (void __user *)((unsigned long)(base) + __vdso_##name##_offset) + +#endif /* CONFIG_MMU */ asmlinkage long sys_riscv_flush_icache(uintptr_t, uintptr_t, uintptr_t); diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index d86781357044..90deabfe63ea 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -113,7 +113,7 @@ static void fill_cacheinfo(struct cacheinfo **this_leaf, } } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct device_node *np = of_cpu_device_node_get(cpu); @@ -155,7 +155,7 @@ static int __init_cache_level(unsigned int cpu) return 0; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *this_leaf = this_cpu_ci->info_list; @@ -187,6 +187,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/riscv/kernel/probes/decode-insn.c b/arch/riscv/kernel/probes/decode-insn.c index 0ed043acc882..64f6183b4717 100644 --- a/arch/riscv/kernel/probes/decode-insn.c +++ b/arch/riscv/kernel/probes/decode-insn.c @@ -38,11 +38,10 @@ riscv_probe_decode_insn(probe_opcode_t *addr, struct arch_probe_insn *api) RISCV_INSN_REJECTED(c_ebreak, insn); #endif - RISCV_INSN_REJECTED(auipc, insn); - RISCV_INSN_REJECTED(branch, insn); - RISCV_INSN_SET_SIMULATE(jal, insn); RISCV_INSN_SET_SIMULATE(jalr, insn); + RISCV_INSN_SET_SIMULATE(auipc, insn); + RISCV_INSN_SET_SIMULATE(branch, insn); return INSN_GOOD; } diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 2519ce26377d..d73e96f6ed7c 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -83,3 +83,115 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg return ret; } + +#define auipc_rd_idx(opcode) \ + ((opcode >> 7) & 0x1f) + +#define auipc_imm(opcode) \ + ((((opcode) >> 12) & 0xfffff) << 12) + +#if __riscv_xlen == 64 +#define auipc_offset(opcode) sign_extend64(auipc_imm(opcode), 31) +#elif __riscv_xlen == 32 +#define auipc_offset(opcode) auipc_imm(opcode) +#else +#error "Unexpected __riscv_xlen" +#endif + +bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + /* + * auipc instruction: + * 31 12 11 7 6 0 + * | imm[31:12] | rd | opcode | + * 20 5 7 + */ + + u32 rd_idx = auipc_rd_idx(opcode); + unsigned long rd_val = addr + auipc_offset(opcode); + + if (!rv_insn_reg_set_val(regs, rd_idx, rd_val)) + return false; + + instruction_pointer_set(regs, addr + 4); + + return true; 
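	/*
	 * Worked decode, for illustration only (the encoding below is just a
	 * sample, nothing this code refers to): 0x00001517 is "auipc a0, 0x1".
	 * auipc_rd_idx() extracts bits [11:7] = 10 (a0), auipc_imm() yields
	 * 0x00001 << 12 = 0x1000, so the simulation sets a0 = addr + 0x1000
	 * and advances the saved pc to addr + 4.
	 */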
+} + +#define branch_rs1_idx(opcode) \ + (((opcode) >> 15) & 0x1f) + +#define branch_rs2_idx(opcode) \ + (((opcode) >> 20) & 0x1f) + +#define branch_funct3(opcode) \ + (((opcode) >> 12) & 0x7) + +#define branch_imm(opcode) \ + (((((opcode) >> 8) & 0xf ) << 1) | \ + ((((opcode) >> 25) & 0x3f) << 5) | \ + ((((opcode) >> 7) & 0x1 ) << 11) | \ + ((((opcode) >> 31) & 0x1 ) << 12)) + +#define branch_offset(opcode) \ + sign_extend32((branch_imm(opcode)), 12) + +#define BRANCH_BEQ 0x0 +#define BRANCH_BNE 0x1 +#define BRANCH_BLT 0x4 +#define BRANCH_BGE 0x5 +#define BRANCH_BLTU 0x6 +#define BRANCH_BGEU 0x7 + +bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + /* + * branch instructions: + * 31 30 25 24 20 19 15 14 12 11 8 7 6 0 + * | imm[12] | imm[10:5] | rs2 | rs1 | funct3 | imm[4:1] | imm[11] | opcode | + * 1 6 5 5 3 4 1 7 + * imm[12|10:5] rs2 rs1 000 imm[4:1|11] 1100011 BEQ + * imm[12|10:5] rs2 rs1 001 imm[4:1|11] 1100011 BNE + * imm[12|10:5] rs2 rs1 100 imm[4:1|11] 1100011 BLT + * imm[12|10:5] rs2 rs1 101 imm[4:1|11] 1100011 BGE + * imm[12|10:5] rs2 rs1 110 imm[4:1|11] 1100011 BLTU + * imm[12|10:5] rs2 rs1 111 imm[4:1|11] 1100011 BGEU + */ + + s32 offset; + s32 offset_tmp; + unsigned long rs1_val; + unsigned long rs2_val; + + if (!rv_insn_reg_get_val(regs, branch_rs1_idx(opcode), &rs1_val) || + !rv_insn_reg_get_val(regs, branch_rs2_idx(opcode), &rs2_val)) + return false; + + offset_tmp = branch_offset(opcode); + switch (branch_funct3(opcode)) { + case BRANCH_BEQ: + offset = (rs1_val == rs2_val) ? offset_tmp : 4; + break; + case BRANCH_BNE: + offset = (rs1_val != rs2_val) ? offset_tmp : 4; + break; + case BRANCH_BLT: + offset = ((long)rs1_val < (long)rs2_val) ? offset_tmp : 4; + break; + case BRANCH_BGE: + offset = ((long)rs1_val >= (long)rs2_val) ? offset_tmp : 4; + break; + case BRANCH_BLTU: + offset = (rs1_val < rs2_val) ? offset_tmp : 4; + break; + case BRANCH_BGEU: + offset = (rs1_val >= rs2_val) ? 
offset_tmp : 4; + break; + default: + return false; + } + + instruction_pointer_set(regs, addr + offset); + + return true; +} diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 120b2f6f71bc..b9620e5f00ba 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -255,7 +255,7 @@ static void __init parse_dtb(void) pr_err("No DTB passed to the kernel\n"); #ifdef CONFIG_CMDLINE_FORCE - strlcpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + strscpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); pr_info("Forcing kernel command line to: %s\n", boot_command_line); #endif } diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 0a98fd0ddfe9..0daaa3e4630d 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -199,11 +199,6 @@ int is_valid_bugaddr(unsigned long pc) } #endif /* CONFIG_GENERIC_BUG */ -/* stvec & scratch is already set from head.S */ -void __init trap_init(void) -{ -} - #ifdef CONFIG_VMAP_STACK static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)__aligned(16); diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 24d936c147cd..f2e065671e4d 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -23,10 +23,10 @@ ifneq ($(c-gettimeofday-y),) endif # Build rules -targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-syms.S +targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) -obj-y += vdso.o vdso-syms.o +obj-y += vdso.o CPPFLAGS_vdso.lds += -P -C -U$(ARCH) # Disable -pg to prevent insert call site @@ -36,6 +36,7 @@ CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os GCOV_PROFILE := n KCOV_INSTRUMENT := n KASAN_SANITIZE := n +UBSAN_SANITIZE := n # Force dependency $(obj)/vdso.o: $(obj)/vdso.so @@ -43,20 +44,22 @@ $(obj)/vdso.o: $(obj)/vdso.so # link rule for the .so file, .lds has to be first $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold) -LDFLAGS_vdso.so.dbg = -shared -s -soname=linux-vdso.so.1 \ +LDFLAGS_vdso.so.dbg = -shared -S -soname=linux-vdso.so.1 \ --build-id=sha1 --hash-style=both --eh-frame-hdr -# We also create a special relocatable object that should mirror the symbol -# table and layout of the linked DSO. With ld --just-symbols we can then -# refer to these symbols in the kernel code rather than hand-coded addresses. -$(obj)/vdso-syms.S: $(obj)/vdso.so FORCE - $(call if_changed,so2s) - # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE + $(call if_changed,vdsosym) + # actual build commands # The DSO images are built using a special linker script # Make sure only to export the intended __vdso_xxx symbol offsets. @@ -65,11 +68,6 @@ quiet_cmd_vdsold = VDSOLD $@ $(OBJCOPY) $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ rm $@.tmp -# Extracts symbol offsets from the VDSO, converting them into an assembly file -# that contains the same symbols at the same offsets. 
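For reference, a sketch of what the new flow produces (symbol name and offset value purely illustrative): the vdsosym rule above writes include/generated/vdso-offsets.h with entries of the form

#define __vdso_rt_sigreturn_offset	0x0800

and VDSO_SYMBOL() in asm/vdso.h, as changed earlier in this series, just adds such an offset to the mapping base, roughly the way the signal-return path resolves it:

	regs->ra = (unsigned long)VDSO_SYMBOL(current->mm->context.vdso, rt_sigreturn);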
-quiet_cmd_so2s = SO2S $@ - cmd_so2s = $(NM) -D $< | $(srctree)/$(src)/so2s.sh > $@ - # install commands for the unstripped file quiet_cmd_vdso_install = INSTALL $@ cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ diff --git a/arch/riscv/kernel/vdso/gen_vdso_offsets.sh b/arch/riscv/kernel/vdso/gen_vdso_offsets.sh new file mode 100755 index 000000000000..c2e5613f3495 --- /dev/null +++ b/arch/riscv/kernel/vdso/gen_vdso_offsets.sh @@ -0,0 +1,5 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +LC_ALL=C +sed -n -e 's/^[0]\+\(0[0-9a-fA-F]*\) . \(__vdso_[a-zA-Z0-9_]*\)$/\#define \2_offset\t0x\1/p' diff --git a/arch/riscv/kernel/vdso/so2s.sh b/arch/riscv/kernel/vdso/so2s.sh deleted file mode 100755 index e64cb6d9440e..000000000000 --- a/arch/riscv/kernel/vdso/so2s.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0+ -# Copyright 2020 Palmer Dabbelt <palmerdabbelt@google.com> - -sed 's!\([0-9a-f]*\) T \([a-z0-9_]*\)\(@@LINUX_4.15\)*!.global \2\n.set \2,0x\1!' \ -| grep '^\.' diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S index af776555ded9..9c9f35091ef0 100644 --- a/arch/riscv/kernel/vmlinux-xip.lds.S +++ b/arch/riscv/kernel/vmlinux-xip.lds.S @@ -121,7 +121,6 @@ SECTIONS } BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) - EXCEPTION_TABLE(0x10) .rel.dyn : AT(ADDR(.rel.dyn) - LOAD_OFFSET) { *(.rel.dyn*) diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 502d0826ecb1..5104f3a871e3 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -4,6 +4,8 @@ * Copyright (C) 2017 SiFive */ +#define RO_EXCEPTION_TABLE_ALIGN 16 + #ifdef CONFIG_XIP_KERNEL #include "vmlinux-xip.lds.S" #else @@ -112,8 +114,6 @@ SECTIONS *(.srodata*) } - EXCEPTION_TABLE(0x10) - . 
= ALIGN(SECTION_ALIGN); _data = .; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 93720b015bb6..c0cddf0fc22d 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -19,6 +19,7 @@ #include <linux/set_memory.h> #include <linux/dma-map-ops.h> #include <linux/crash_dump.h> +#include <linux/hugetlb.h> #include <asm/fixmap.h> #include <asm/tlbflush.h> @@ -222,6 +223,8 @@ static void __init setup_bootmem(void) early_init_fdt_scan_reserved_mem(); dma_contiguous_reserve(dma32_phys_limit); + if (IS_ENABLED(CONFIG_64BIT)) + hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); memblock_allow_resize(); } @@ -234,14 +237,15 @@ static struct pt_alloc_ops _pt_ops __initdata; #define pt_ops _pt_ops #endif -unsigned long pfn_base __ro_after_init; -EXPORT_SYMBOL(pfn_base); +unsigned long riscv_pfn_base __ro_after_init; +EXPORT_SYMBOL(riscv_pfn_base); pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); +static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL #define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) @@ -322,7 +326,6 @@ static void __init create_pte_mapping(pte_t *ptep, static pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss; static pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); -static pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL #define trampoline_pmd ((pmd_t *)XIP_FIXUP(trampoline_pmd)) @@ -408,6 +411,7 @@ static void __init create_pmd_mapping(pmd_t *pmdp, #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) #define fixmap_pgd_next fixmap_pte +#define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) #endif void __init create_pgd_mapping(pgd_t *pgdp, @@ -515,49 +519,80 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) #endif #ifdef CONFIG_XIP_KERNEL -static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size, +static void __init create_kernel_page_table(pgd_t *pgdir, __always_unused bool early) { uintptr_t va, end_va; /* Map the flash resident part */ end_va = kernel_map.virt_addr + kernel_map.xiprom_sz; - for (va = kernel_map.virt_addr; va < end_va; va += map_size) + for (va = kernel_map.virt_addr; va < end_va; va += PMD_SIZE) create_pgd_mapping(pgdir, va, kernel_map.xiprom + (va - kernel_map.virt_addr), - map_size, PAGE_KERNEL_EXEC); + PMD_SIZE, PAGE_KERNEL_EXEC); /* Map the data in RAM */ end_va = kernel_map.virt_addr + XIP_OFFSET + kernel_map.size; - for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += map_size) + for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += PMD_SIZE) create_pgd_mapping(pgdir, va, kernel_map.phys_addr + (va - (kernel_map.virt_addr + XIP_OFFSET)), - map_size, PAGE_KERNEL); + PMD_SIZE, PAGE_KERNEL); } #else -static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size, - bool early) +static void __init create_kernel_page_table(pgd_t *pgdir, bool early) { uintptr_t va, end_va; end_va = kernel_map.virt_addr + kernel_map.size; - for (va = kernel_map.virt_addr; va < end_va; va += map_size) + for (va = kernel_map.virt_addr; va < end_va; va += PMD_SIZE) create_pgd_mapping(pgdir, va, kernel_map.phys_addr + (va - kernel_map.virt_addr), - map_size, + 
PMD_SIZE, early ? PAGE_KERNEL_EXEC : pgprot_from_va(va)); } #endif -asmlinkage void __init setup_vm(uintptr_t dtb_pa) +/* + * Setup a 4MB mapping that encompasses the device tree: for 64-bit kernel, + * this means 2 PMD entries whereas for 32-bit kernel, this is only 1 PGDIR + * entry. + */ +static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) { - uintptr_t __maybe_unused pa; - uintptr_t map_size; -#ifndef __PAGETABLE_PMD_FOLDED - pmd_t fix_bmap_spmd, fix_bmap_epmd; +#ifndef CONFIG_BUILTIN_DTB + uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); + + create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, + IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa, + PGDIR_SIZE, + IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL); + + if (IS_ENABLED(CONFIG_64BIT)) { + create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, + pa, PMD_SIZE, PAGE_KERNEL); + create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE, + pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); + } + + dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); +#else + /* + * For 64-bit kernel, __va can't be used since it would return a linear + * mapping address whereas dtb_early_va will be used before + * setup_vm_final installs the linear mapping. For 32-bit kernel, as the + * kernel is mapped in the linear mapping, that makes no difference. + */ + dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); #endif + dtb_early_pa = dtb_pa; +} + +asmlinkage void __init setup_vm(uintptr_t dtb_pa) +{ + pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; + kernel_map.virt_addr = KERNEL_LINK_ADDR; #ifdef CONFIG_XIP_KERNEL @@ -573,23 +608,14 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) kernel_map.phys_addr = (uintptr_t)(&_start); kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; #endif - kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; -#ifdef CONFIG_64BIT kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; -#endif - pfn_base = PFN_DOWN(kernel_map.phys_addr); - - /* - * Enforce boot alignment requirements of RV32 and - * RV64 by only allowing PMD or PGD mappings. - */ - map_size = PMD_SIZE; + riscv_pfn_base = PFN_DOWN(kernel_map.phys_addr); /* Sanity check alignment and size */ BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); - BUG_ON((kernel_map.phys_addr % map_size) != 0); + BUG_ON((kernel_map.phys_addr % PMD_SIZE) != 0); #ifdef CONFIG_64BIT /* @@ -634,50 +660,10 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) * us to reach paging_init(). We map all memory banks later * in setup_vm_final() below. */ - create_kernel_page_table(early_pg_dir, map_size, true); + create_kernel_page_table(early_pg_dir, true); -#ifndef __PAGETABLE_PMD_FOLDED - /* Setup early PMD for DTB */ - create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - (uintptr_t)early_dtb_pmd, PGDIR_SIZE, PAGE_TABLE); -#ifndef CONFIG_BUILTIN_DTB - /* Create two consecutive PMD mappings for FDT early scan */ - pa = dtb_pa & ~(PMD_SIZE - 1); - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, - pa, PMD_SIZE, PAGE_KERNEL); - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE, - pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); - dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); -#else /* CONFIG_BUILTIN_DTB */ -#ifdef CONFIG_64BIT - /* - * __va can't be used since it would return a linear mapping address - * whereas dtb_early_va will be used before setup_vm_final installs - * the linear mapping. 
- */ - dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); -#else - dtb_early_va = __va(dtb_pa); -#endif /* CONFIG_64BIT */ -#endif /* CONFIG_BUILTIN_DTB */ -#else -#ifndef CONFIG_BUILTIN_DTB - /* Create two consecutive PGD mappings for FDT early scan */ - pa = dtb_pa & ~(PGDIR_SIZE - 1); - create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - pa, PGDIR_SIZE, PAGE_KERNEL); - create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA + PGDIR_SIZE, - pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); - dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); -#else /* CONFIG_BUILTIN_DTB */ -#ifdef CONFIG_64BIT - dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); -#else - dtb_early_va = __va(dtb_pa); -#endif /* CONFIG_64BIT */ -#endif /* CONFIG_BUILTIN_DTB */ -#endif - dtb_early_pa = dtb_pa; + /* Setup early mapping for FDT early scan */ + create_fdt_early_page_table(early_pg_dir, dtb_pa); /* * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap @@ -752,7 +738,7 @@ static void __init setup_vm_final(void) #ifdef CONFIG_64BIT /* Map the kernel */ - create_kernel_page_table(swapper_pg_dir, PMD_SIZE, false); + create_kernel_page_table(swapper_pg_dir, false); #endif /* Clear fixmap PTE and PMD mappings */ @@ -819,38 +805,22 @@ static void __init reserve_crashkernel(void) crash_size = PAGE_ALIGN(crash_size); - if (crash_base == 0) { - /* - * Current riscv boot protocol requires 2MB alignment for - * RV64 and 4MB alignment for RV32 (hugepage size) - */ - crash_base = memblock_find_in_range(search_start, search_end, - crash_size, PMD_SIZE); - - if (crash_base == 0) { - pr_warn("crashkernel: couldn't allocate %lldKB\n", - crash_size >> 10); - return; - } - } else { - /* User specifies base address explicitly. */ - if (!memblock_is_region_memory(crash_base, crash_size)) { - pr_warn("crashkernel: requested region is not memory\n"); - return; - } - - if (memblock_is_region_reserved(crash_base, crash_size)) { - pr_warn("crashkernel: requested region is reserved\n"); - return; - } - + if (crash_base) { + search_start = crash_base; + search_end = crash_base + crash_size; + } - if (!IS_ALIGNED(crash_base, PMD_SIZE)) { - pr_warn("crashkernel: requested region is misaligned\n"); - return; - } + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 (hugepage size) + */ + crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, + search_start, search_end); + if (crash_base == 0) { + pr_warn("crashkernel: couldn't allocate %lldKB\n", + crash_size >> 10); + return; } - memblock_reserve(crash_base, crash_size); pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 92c0a1b4c528..b86de61b8caa 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -110,6 +110,7 @@ config S390 select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF @@ -209,6 +210,7 @@ config S390 select SWIOTLB select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select TRACE_IRQFLAGS_SUPPORT select TTY select VIRT_CPU_ACCOUNTING select ZONE_DMA @@ -683,16 +685,6 @@ config STACK_GUARD The minimum size for the stack guard should be 256 for 31 bit and 512 for 64 bit. 
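The option removed just below warned, via the compiler's -mwarn-dynamicstack switch, about functions whose stack frame is sized at run time. A minimal sketch of such a function (the name is made up; the dynamic stack usage here is a variable-length array, which gcc flags the same way it flags alloca()):

void touch_dynamic(unsigned int n)
{
	char buf[n];	/* VLA: stack usage only known at run time */

	buf[n - 1] = 0;
}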
-config WARN_DYNAMIC_STACK - def_bool n - prompt "Emit compiler warnings for function with dynamic stack usage" - help - This option enables the compiler option -mwarn-dynamicstack. If the - compiler supports this options generates warnings for functions - that dynamically allocate stack space using alloca. - - Say N if you are unsure. - endmenu menu "I/O subsystem" diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index 9ea6e61d5858..e94a2a7f6bf4 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -1,8 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config EARLY_PRINTK def_bool y diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 17dc4f1ac4fa..450b351dfa8e 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -70,7 +70,7 @@ cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include # cflags-$(CONFIG_FRAME_POINTER) += -fno-optimize-sibling-calls -ifeq ($(call cc-option-yn,-mpacked-stack -mbackchain -msoft-float),y) +ifneq ($(call cc-option,-mpacked-stack -mbackchain -msoft-float),) cflags-$(CONFIG_PACK_STACK) += -mpacked-stack -D__PACK_STACK aflags-$(CONFIG_PACK_STACK) += -D__PACK_STACK endif @@ -78,22 +78,15 @@ endif KBUILD_AFLAGS_DECOMPRESSOR += $(aflags-y) KBUILD_CFLAGS_DECOMPRESSOR += $(cflags-y) -ifeq ($(call cc-option-yn,-mstack-size=8192 -mstack-guard=128),y) +ifneq ($(call cc-option,-mstack-size=8192 -mstack-guard=128),) cflags-$(CONFIG_CHECK_STACK) += -mstack-size=$(STACK_SIZE) -ifneq ($(call cc-option-yn,-mstack-size=8192),y) +ifeq ($(call cc-option,-mstack-size=8192),) cflags-$(CONFIG_CHECK_STACK) += -mstack-guard=$(CONFIG_STACK_GUARD) endif endif -ifdef CONFIG_WARN_DYNAMIC_STACK - ifeq ($(call cc-option-yn,-mwarn-dynamicstack),y) - KBUILD_CFLAGS += -mwarn-dynamicstack - KBUILD_CFLAGS_DECOMPRESSOR += -mwarn-dynamicstack - endif -endif - ifdef CONFIG_EXPOLINE - ifeq ($(call cc-option-yn,$(CC_FLAGS_MARCH) -mindirect-branch=thunk),y) + ifneq ($(call cc-option,$(CC_FLAGS_MARCH) -mindirect-branch=thunk),) CC_FLAGS_EXPOLINE := -mindirect-branch=thunk CC_FLAGS_EXPOLINE += -mfunction-return=thunk CC_FLAGS_EXPOLINE += -mindirect-branch-table @@ -104,10 +97,10 @@ ifdef CONFIG_EXPOLINE endif ifdef CONFIG_FUNCTION_TRACER - ifeq ($(call cc-option-yn,-mfentry -mnop-mcount),n) + ifeq ($(call cc-option,-mfentry -mnop-mcount),) # make use of hotpatch feature if the compiler supports it cc_hotpatch := -mhotpatch=0,3 - ifeq ($(call cc-option-yn,$(cc_hotpatch)),y) + ifneq ($(call cc-option,$(cc_hotpatch)),) CC_FLAGS_FTRACE := $(cc_hotpatch) KBUILD_AFLAGS += -DCC_USING_HOTPATCH KBUILD_CFLAGS += -DCC_USING_HOTPATCH diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c index 209f6ae5a197..75bcbfa27941 100644 --- a/arch/s390/boot/pgm_check_info.c +++ b/arch/s390/boot/pgm_check_info.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> +#include <linux/stdarg.h> #include <linux/string.h> #include <linux/ctype.h> #include <asm/stacktrace.h> @@ -8,7 +9,6 @@ #include <asm/setup.h> #include <asm/sclp.h> #include <asm/uv.h> -#include <stdarg.h> #include "boot.h" const char hex_asc[] = "0123456789abcdef"; diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 11ffc7c37ada..6aad18ee131d 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -10,6 +10,7 @@ CONFIG_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_LSM=y CONFIG_PREEMPT=y +CONFIG_SCHED_CORE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y 
CONFIG_TASKSTATS=y @@ -503,6 +504,7 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MICROSOFT is not set +# CONFIG_NET_VENDOR_LITEX is not set # CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m @@ -661,7 +663,6 @@ CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFSD_V4_SECURITY_LABEL=y CONFIG_CIFS=m -CONFIG_CIFS_WEAK_PW_HASH=y CONFIG_CIFS_UPCALL=y CONFIG_CIFS_XATTR=y CONFIG_CIFS_POSIX=y @@ -720,6 +721,8 @@ CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_BLAKE2S=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m @@ -774,7 +777,6 @@ CONFIG_RANDOM32_SELFTEST=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 CONFIG_DMA_API_DEBUG=y -CONFIG_STRING_SELFTEST=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y @@ -804,6 +806,7 @@ CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m CONFIG_DEBUG_PER_CPU_MAPS=y +CONFIG_KFENCE=y CONFIG_DEBUG_SHIRQ=y CONFIG_PANIC_ON_OOPS=y CONFIG_DETECT_HUNG_TASK=y @@ -852,12 +855,12 @@ CONFIG_FAIL_FUNCTION=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y CONFIG_LKDTM=m CONFIG_TEST_MIN_HEAP=y -CONFIG_TEST_SORT=y CONFIG_KPROBES_SANITY_TEST=y CONFIG_RBTREE_TEST=y CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y +CONFIG_STRING_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index e1642d2cba59..f08b161c9446 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -8,6 +8,7 @@ CONFIG_BPF_SYSCALL=y CONFIG_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_LSM=y +CONFIG_SCHED_CORE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_TASKSTATS=y @@ -397,7 +398,6 @@ CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 -# CONFIG_BLK_DEV_XPRAM is not set CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_RBD=m CONFIG_BLK_DEV_NVME=m @@ -495,6 +495,7 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MICROSOFT is not set +# CONFIG_NET_VENDOR_LITEX is not set # CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m @@ -649,7 +650,6 @@ CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFSD_V4_SECURITY_LABEL=y CONFIG_CIFS=m -CONFIG_CIFS_WEAK_PW_HASH=y CONFIG_CIFS_UPCALL=y CONFIG_CIFS_XATTR=y CONFIG_CIFS_POSIX=y @@ -709,6 +709,8 @@ CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_BLAKE2S=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index d576aaab27c9..aceccf3b9a88 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -35,7 +35,6 @@ CONFIG_NET=y # CONFIG_ETHTOOL_NETLINK is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_RAM=y -# CONFIG_BLK_DEV_XPRAM is not set # CONFIG_DCSSBLK is not set # CONFIG_DASD is not set CONFIG_ENCLOSURE_SERVICES=y diff --git a/arch/s390/include/asm/ccwgroup.h b/arch/s390/include/asm/ccwgroup.h index 36dbf5043fc0..aa995d91cd1d 100644 --- a/arch/s390/include/asm/ccwgroup.h +++ b/arch/s390/include/asm/ccwgroup.h @@ -55,7 +55,7 @@ int ccwgroup_create_dev(struct device *root, struct ccwgroup_driver *gdrv, int num_devices, const char *buf); extern int ccwgroup_set_online(struct 
ccwgroup_device *gdev); -extern int ccwgroup_set_offline(struct ccwgroup_device *gdev); +int ccwgroup_set_offline(struct ccwgroup_device *gdev, bool call_gdrv); extern int ccwgroup_probe_ccwdev(struct ccw_device *cdev); extern void ccwgroup_remove_ccwdev(struct ccw_device *cdev); diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 8d49505b4a43..cdc7ae72529d 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -176,16 +176,6 @@ static inline int is_compat_task(void) return test_thread_flag(TIF_31BIT); } -static inline void __user *arch_compat_alloc_user_space(long len) -{ - unsigned long stack; - - stack = KSTK_ESP(current); - if (is_compat_task()) - stack &= 0x7fffffffUL; - return (void __user *) (stack - len); -} - #endif struct compat_ipc64_perm { diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h index ca0e0e5ddbc4..f87a4788c19c 100644 --- a/arch/s390/include/asm/cpu_mcf.h +++ b/arch/s390/include/asm/cpu_mcf.h @@ -24,13 +24,6 @@ enum cpumf_ctr_set { #define CPUMF_LCCTL_ENABLE_SHIFT 16 #define CPUMF_LCCTL_ACTCTL_SHIFT 0 -static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { - [CPUMF_CTR_SET_BASIC] = 0x02, - [CPUMF_CTR_SET_USER] = 0x04, - [CPUMF_CTR_SET_CRYPTO] = 0x08, - [CPUMF_CTR_SET_EXT] = 0x01, - [CPUMF_CTR_SET_MT_DIAG] = 0x20, -}; static inline void ctr_set_enable(u64 *state, u64 ctrsets) { diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d681ae462350..a604d51acfc8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -244,6 +244,7 @@ struct kvm_s390_sie_block { __u8 fpf; /* 0x0060 */ #define ECB_GS 0x40 #define ECB_TE 0x10 +#define ECB_SPECI 0x08 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 __u8 ecb; /* 0x0061 */ @@ -955,6 +956,7 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + /* indexed by vcpu_idx */ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; struct kvm_s390_pv pv; diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index e317fd4866c1..f16f4d054ae2 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -18,6 +18,7 @@ extern struct mutex smp_cpu_state_mutex; extern unsigned int smp_cpu_mt_shift; extern unsigned int smp_cpu_mtid; extern __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; +extern cpumask_t cpu_setup_mask; extern int __cpu_up(unsigned int cpu, struct task_struct *tidle); diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index 3d8a4b94c620..dd00d98804ec 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -34,16 +34,6 @@ static inline bool on_stack(struct stack_info *info, return addr >= info->begin && addr + len <= info->end; } -static __always_inline unsigned long get_stack_pointer(struct task_struct *task, - struct pt_regs *regs) -{ - if (regs) - return (unsigned long) kernel_stack_pointer(regs); - if (task == current) - return current_stack_pointer(); - return (unsigned long) task->thread.ksp; -} - /* * Stack layout of a C stack frame. 
*/ @@ -74,6 +64,16 @@ struct stack_frame { ((unsigned long)__builtin_frame_address(0) - \ offsetof(struct stack_frame, back_chain)) +static __always_inline unsigned long get_stack_pointer(struct task_struct *task, + struct pt_regs *regs) +{ + if (regs) + return (unsigned long)kernel_stack_pointer(regs); + if (task == current) + return current_frame_address(); + return (unsigned long)task->thread.ksp; +} + /* * To keep this simple mark register 2-6 as being changed (volatile) * by the called function, even though register 6 is saved/nonvolatile. diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 9ed9aa37e836..ce550d06abc3 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -227,9 +227,6 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s __get_user(x, ptr); \ }) -unsigned long __must_check -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n); - /* * Copy a null terminated string from userspace. */ diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h index de9006b0cfeb..5ebf534ef753 100644 --- a/arch/s390/include/asm/unwind.h +++ b/arch/s390/include/asm/unwind.h @@ -55,10 +55,10 @@ static inline bool unwind_error(struct unwind_state *state) return state->error; } -static inline void unwind_start(struct unwind_state *state, - struct task_struct *task, - struct pt_regs *regs, - unsigned long first_frame) +static __always_inline void unwind_start(struct unwind_state *state, + struct task_struct *task, + struct pt_regs *regs, + unsigned long first_frame) { task = task ?: current; first_frame = first_frame ?: get_stack_pointer(task, regs); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index b9716a7e326d..4c9b967290ae 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -140,10 +140,10 @@ _LPP_OFFSET = __LC_LPP TSTMSK __LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR) jnz \errlabel TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD - jz oklabel\@ + jz .Loklabel\@ TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR jnz \errlabel -oklabel\@: +.Loklabel\@: .endm #if IS_ENABLED(CONFIG_KVM) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 0a464d328467..1d94ffdf347b 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -341,13 +341,13 @@ NOKPROBE_SYMBOL(prepare_ftrace_return); */ int ftrace_enable_ftrace_graph_caller(void) { - brcl_disable(__va(ftrace_graph_caller)); + brcl_disable(ftrace_graph_caller); return 0; } int ftrace_disable_ftrace_graph_caller(void) { - brcl_enable(__va(ftrace_graph_caller)); + brcl_enable(ftrace_graph_caller); return 0; } diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 2e3bb633acf6..4a99154fe651 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -158,6 +158,14 @@ static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, return need; } +static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { + [CPUMF_CTR_SET_BASIC] = 0x02, + [CPUMF_CTR_SET_USER] = 0x04, + [CPUMF_CTR_SET_CRYPTO] = 0x08, + [CPUMF_CTR_SET_EXT] = 0x01, + [CPUMF_CTR_SET_MT_DIAG] = 0x20, +}; + /* Read out all counter sets and save them in the provided data buffer. * The last 64 byte host an artificial trailer entry. 
*/ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index fe14beb338e5..67e5fff96ee0 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -50,6 +50,7 @@ #include <linux/compat.h> #include <linux/start_kernel.h> #include <linux/hugetlb.h> +#include <linux/kmemleak.h> #include <asm/boot_data.h> #include <asm/ipl.h> @@ -356,9 +357,12 @@ void *restart_stack; unsigned long stack_alloc(void) { #ifdef CONFIG_VMAP_STACK - return (unsigned long)__vmalloc_node(THREAD_SIZE, THREAD_SIZE, - THREADINFO_GFP, NUMA_NO_NODE, - __builtin_return_address(0)); + void *ret; + + ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP, + NUMA_NO_NODE, __builtin_return_address(0)); + kmemleak_not_leak(ret); + return (unsigned long)ret; #else return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); #endif @@ -677,8 +681,9 @@ static void __init reserve_crashkernel(void) return; } low = crash_base ?: low; - crash_base = memblock_find_in_range(low, high, crash_size, - KEXEC_CRASH_MEM_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + KEXEC_CRASH_MEM_ALIGN, + low, high); } if (!crash_base) { @@ -687,8 +692,10 @@ static void __init reserve_crashkernel(void) return; } - if (register_memory_notifier(&kdump_mem_nb)) + if (register_memory_notifier(&kdump_mem_nb)) { + memblock_free(crash_base, crash_size); return; + } if (!oldmem_data.start && MACHINE_IS_VM) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2a991e43ead3..1a04e5bdf655 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -95,6 +95,7 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; #endif static unsigned int smp_max_threads __initdata = -1U; +cpumask_t cpu_setup_mask; static int __init early_nosmt(char *s) { @@ -902,13 +903,14 @@ static void smp_start_secondary(void *cpuvoid) vtime_init(); vdso_getcpu_init(); pfault_init(); + cpumask_set_cpu(cpu, &cpu_setup_mask); + update_cpu_masks(); notify_cpu_starting(cpu); if (topology_cpu_dedicated(cpu)) set_cpu_flag(CIF_DEDICATED_CPU); else clear_cpu_flag(CIF_DEDICATED_CPU); set_cpu_online(cpu, true); - update_cpu_masks(); inc_irq_stat(CPU_RST); local_irq_enable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); @@ -950,10 +952,13 @@ early_param("possible_cpus", _setup_possible_cpus); int __cpu_disable(void) { unsigned long cregs[16]; + int cpu; /* Handle possible pending IPIs */ smp_handle_ext_call(); - set_cpu_online(smp_processor_id(), false); + cpu = smp_processor_id(); + set_cpu_online(cpu, false); + cpumask_clear_cpu(cpu, &cpu_setup_mask); update_cpu_masks(); /* Disable pseudo page faults on this cpu. 
*/ pfault_fini(); diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index aa705e1bd0dc..df5261e5cfe1 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -274,9 +274,9 @@ 265 common statfs64 sys_statfs64 compat_sys_statfs64 266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind compat_sys_mbind -269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 common mbind sys_mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy 271 common mq_open sys_mq_open compat_sys_mq_open 272 common mq_unlink sys_mq_unlink sys_mq_unlink 273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 @@ -293,7 +293,7 @@ 284 common inotify_init sys_inotify_init sys_inotify_init 285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch 286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 common migrate_pages sys_migrate_pages sys_migrate_pages 288 common openat sys_openat compat_sys_openat 289 common mkdirat sys_mkdirat sys_mkdirat 290 common mknodat sys_mknodat sys_mknodat @@ -317,7 +317,7 @@ 307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range 308 common tee sys_tee sys_tee 309 common vmsplice sys_vmsplice sys_vmsplice -310 common move_pages sys_move_pages compat_sys_move_pages +310 common move_pages sys_move_pages sys_move_pages 311 common getcpu sys_getcpu sys_getcpu 312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 313 common utimes sys_utimes sys_utimes_time32 @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease sys_process_mrelease diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index d2458a29618f..58f8291950cb 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -67,7 +67,7 @@ static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int c static cpumask_t mask; cpumask_clear(&mask); - if (!cpu_online(cpu)) + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) goto out; cpumask_set_cpu(cpu, &mask); switch (topology_mode) { @@ -88,7 +88,7 @@ static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int c case TOPOLOGY_MODE_SINGLE: break; } - cpumask_and(&mask, &mask, cpu_online_mask); + cpumask_and(&mask, &mask, &cpu_setup_mask); out: cpumask_copy(dst, &mask); } @@ -99,16 +99,16 @@ static void cpu_thread_map(cpumask_t *dst, unsigned int cpu) int i; cpumask_clear(&mask); - if (!cpu_online(cpu)) + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) goto out; cpumask_set_cpu(cpu, &mask); if (topology_mode != TOPOLOGY_MODE_HW) goto out; cpu -= cpu % (smp_cpu_mtid + 1); - for (i = 0; i <= smp_cpu_mtid; i++) - if (cpu_present(cpu + i)) + for (i = 0; i <= smp_cpu_mtid; i++) { + if (cpumask_test_cpu(cpu + i, &cpu_setup_mask)) cpumask_set_cpu(cpu + i, &mask); - cpumask_and(&mask, &mask, cpu_online_mask); + } out: cpumask_copy(dst, &mask); } @@ -569,6 +569,7 
@@ void __init topology_init_early(void) alloc_masks(info, &book_info, 2); alloc_masks(info, &drawer_info, 3); out: + cpumask_set_cpu(0, &cpu_setup_mask); __arch_update_cpu_topology(); __arch_update_dedicated_flag(NULL); } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d548d60caed2..16256e17a544 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) { - int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; - for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { - vcpu = kvm_get_vcpu(kvm, vcpu_id); + for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); if (deliverable_mask) { /* lately kicked but not yet running */ - if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; kvm_s390_vcpu_wakeup(vcpu); return; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index efda0615741f..752a0ffab9bf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -66,8 +66,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, inject_service_signal), STATS_DESC_COUNTER(VM, inject_virtio) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -174,8 +172,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), STATS_DESC_COUNTER(VCPU, pfault_sync) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -1953,7 +1949,7 @@ out: static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); + int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; if (gfn >= memslots[slot].base_gfn && @@ -1974,7 +1970,7 @@ static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) if (gfn >= memslots[start].base_gfn && gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); + atomic_set(&slots->last_used_slot, start); } return start; @@ -3224,6 +3220,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= ECB_SRSI; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; + if 
(!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.sie_block->ecb |= ECB_SPECI; if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi) vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; @@ -4068,7 +4066,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 9fad25109b0d..ecd741ee3276 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 4002a24bc43a..acda4b6fc851 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -510,6 +510,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix_unmapped(vsie_page); scb_s->ecb |= ECB_TE; } + /* specification exception interpretation */ + scb_s->ecb |= scb_o->ecb & ECB_SPECI; /* branch prediction */ if (test_kvm_facility(vcpu->kvm, 82)) scb_s->fpf |= scb_o->fpf & FPF_BPBC; diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 94ca99bde59d..a596e69d3c47 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -204,69 +204,6 @@ unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long } EXPORT_SYMBOL(raw_copy_to_user); -static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from, - unsigned long size) -{ - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - /* FIXME: copy with reduced length. 
*/ - asm volatile( - " lgr 0,%[spec]\n" - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - " jz 2f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2:slgr %0,%0\n" - "3: \n" - EX_TABLE(0b,3b) - : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) - : [spec] "d" (0x810081UL) - : "cc", "memory", "0"); - return size; -} - -static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from, - unsigned long size) -{ - unsigned long tmp1; - - asm volatile( - " sacf 256\n" - " aghi %0,-1\n" - " jo 5f\n" - " bras %3,3f\n" - "0: aghi %0,257\n" - "1: mvc 0(1,%1),0(%2)\n" - " la %1,1(%1)\n" - " la %2,1(%2)\n" - " aghi %0,-1\n" - " jnz 1b\n" - " j 5f\n" - "2: mvc 0(256,%1),0(%2)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "3: aghi %0,-256\n" - " jnm 2b\n" - "4: ex %0,1b-0b(%3)\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) - : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1) - : : "cc", "memory"); - return size; -} - -unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - if (copy_with_mvcos()) - return copy_in_user_mvcos(to, from, n); - return copy_in_user_mvc(to, from, n); -} -EXPORT_SYMBOL(raw_copy_in_user); - static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) { unsigned long tmp1, tmp2; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 9bb2c7512cd5..4d3b33ce81c6 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -27,7 +27,6 @@ /** * gmap_alloc - allocate and initialize a guest address space - * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. @@ -504,7 +503,7 @@ EXPORT_SYMBOL_GPL(gmap_translate); /** * gmap_unlink - disconnect a page table from the gmap shadow tables - * @gmap: pointer to guest mapping meta data structure + * @mm: pointer to the parent mm_struct * @table: pointer to the host page table * @vmaddr: vm address associated with the host page table */ @@ -527,7 +526,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, unsigned long gaddr); /** - * gmap_link - set up shadow page tables to connect a host to a guest address + * __gmap_link - set up shadow page tables to connect a host to a guest address * @gmap: pointer to guest mapping meta data structure * @gaddr: guest address * @vmaddr: vm address @@ -1971,7 +1970,7 @@ out_free: EXPORT_SYMBOL_GPL(gmap_shadow_sgt); /** - * gmap_shadow_lookup_pgtable - find a shadow page table + * gmap_shadow_pgt_lookup - find a shadow page table * @sg: pointer to the shadow guest address space structure * @saddr: the address in the shadow aguest address space * @pgt: parent gmap address of the page table to get shadowed @@ -2165,7 +2164,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) } EXPORT_SYMBOL_GPL(gmap_shadow_page); -/** +/* * gmap_shadow_notify - handle notifications for shadow gmap * * Called with sg->parent->shadow_lock. @@ -2225,7 +2224,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, /** * ptep_notify - call all invalidation callbacks for a specific pte. 
* @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space + * @vmaddr: virtual address in the process address space * @pte: pointer to the page table entry * @bits: bits from the pgste that caused the notify call * diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index f3db3caa8447..a04faf49001a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -187,9 +187,9 @@ static void pv_init(void) return; /* make sure bounce buffers are shared */ + swiotlb_force = SWIOTLB_FORCE; swiotlb_init(1); swiotlb_update_mem_attributes(); - swiotlb_force = SWIOTLB_FORCE; } void __init mem_init(void) @@ -307,8 +307,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return rc; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index eec3a9d7176e..034721a68d8f 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -834,7 +834,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(set_guest_storage_key); -/** +/* * Conditionally set a guest storage key (handling csske). * oldkey will be updated when either mr or mc is set and a pointer is given. * @@ -867,7 +867,7 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(cond_set_guest_storage_key); -/** +/* * Reset a guest reference bit (rrbe), returning the reference and changed bit. * * Returns < 0 in case of error, otherwise the cc to be reported to the guest. diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 88419263a89a..840d8594437d 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -248,8 +248,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ ({ \ - /* Branch instruction needs 6 bytes */ \ - int rel = (addrs[(i) + (off) + 1] - (addrs[(i) + 1] - 6)) / 2;\ + int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ @@ -761,10 +760,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9080000, dst_reg, src_reg); break; case BPF_ALU | BPF_ADD | BPF_K: /* dst = (u32) dst + (u32) imm */ - if (!imm) - break; - /* alfi %dst,imm */ - EMIT6_IMM(0xc20b0000, dst_reg, imm); + if (imm != 0) { + /* alfi %dst,imm */ + EMIT6_IMM(0xc20b0000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ADD | BPF_K: /* dst = dst + imm */ @@ -786,17 +785,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9090000, dst_reg, src_reg); break; case BPF_ALU | BPF_SUB | BPF_K: /* dst = (u32) dst - (u32) imm */ - if (!imm) - break; - /* alfi %dst,-imm */ - EMIT6_IMM(0xc20b0000, dst_reg, -imm); + if (imm != 0) { + /* alfi %dst,-imm */ + EMIT6_IMM(0xc20b0000, dst_reg, -imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_SUB | BPF_K: /* dst = dst - imm */ if (!imm) break; - /* agfi %dst,-imm */ - EMIT6_IMM(0xc2080000, dst_reg, -imm); + if (imm == -0x80000000) { + /* algfi %dst,0x80000000 */ + EMIT6_IMM(0xc20a0000, dst_reg, 0x80000000); + } else { + /* agfi %dst,-imm */ + EMIT6_IMM(0xc2080000, dst_reg, -imm); + } break; /* * BPF_MUL @@ -811,10 +815,10 @@ static noinline int 
bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb90c0000, dst_reg, src_reg); break; case BPF_ALU | BPF_MUL | BPF_K: /* dst = (u32) dst * (u32) imm */ - if (imm == 1) - break; - /* msfi %r5,imm */ - EMIT6_IMM(0xc2010000, dst_reg, imm); + if (imm != 1) { + /* msfi %r5,imm */ + EMIT6_IMM(0xc2010000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_MUL | BPF_K: /* dst = dst * imm */ @@ -867,6 +871,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, if (BPF_OP(insn->code) == BPF_MOD) /* lhgi %dst,0 */ EMIT4_IMM(0xa7090000, dst_reg, 0); + else + EMIT_ZERO(dst_reg); break; } /* lhi %w0,0 */ @@ -999,10 +1005,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9820000, dst_reg, src_reg); break; case BPF_ALU | BPF_XOR | BPF_K: /* dst = (u32) dst ^ (u32) imm */ - if (!imm) - break; - /* xilf %dst,imm */ - EMIT6_IMM(0xc0070000, dst_reg, imm); + if (imm != 0) { + /* xilf %dst,imm */ + EMIT6_IMM(0xc0070000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */ @@ -1033,10 +1039,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000d, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_LSH | BPF_K: /* dst = (u32) dst << (u32) imm */ - if (imm == 0) - break; - /* sll %dst,imm(%r0) */ - EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sll %dst,imm(%r0) */ + EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_LSH | BPF_K: /* dst = dst << imm */ @@ -1058,10 +1064,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000c, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_RSH | BPF_K: /* dst = (u32) dst >> (u32) imm */ - if (imm == 0) - break; - /* srl %dst,imm(%r0) */ - EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* srl %dst,imm(%r0) */ + EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_RSH | BPF_K: /* dst = dst >> imm */ @@ -1083,10 +1089,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000a, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst >> imm */ - if (imm == 0) - break; - /* sra %dst,imm(%r0) */ - EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sra %dst,imm(%r0) */ + EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */ diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 51dc2215a2b7..be077b39da33 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -383,8 +383,8 @@ static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid, rc = clp_list_pci_req(rrb, &resume_token, &nentries); if (rc) return rc; + fh_list = rrb->response.fh_list; for (i = 0; i < nentries; i++) { - fh_list = rrb->response.fh_list; if (fh_list[i].fid == fid) { *entry = fh_list[i]; return 0; @@ -449,14 +449,17 @@ int clp_get_state(u32 fid, enum zpci_state *state) struct clp_fh_list_entry entry; int rc; - *state = ZPCI_FN_STATE_RESERVED; rrb = clp_alloc_block(GFP_ATOMIC); if (!rrb) return -ENOMEM; rc = clp_find_pci(rrb, fid, &entry); - if (!rc) + if (!rc) { *state = entry.config_state; + } else if (rc == -ENODEV) { + *state = ZPCI_FN_STATE_RESERVED; + rc = 0; + } clp_free_block(rrb); return rc; diff --git 
a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index ae683aa623ac..c5b35ea129cf 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -159,7 +159,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, mmap_read_lock(current->mm); ret = -EINVAL; - vma = find_vma(current->mm, mmio_addr); + vma = vma_lookup(current->mm, mmio_addr); if (!vma) goto out_unlock_mmap; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) @@ -298,7 +298,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, mmap_read_lock(current->mm); ret = -EINVAL; - vma = find_vma(current->mm, mmio_addr); + vma = vma_lookup(current->mm, mmio_addr); if (!vma) goto out_unlock_mmap; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index f37280e805ea..6904f4bdbf00 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -69,6 +69,7 @@ config SUPERH select RTC_LIB select SET_FS select SPARSE_IRQ + select TRACE_IRQFLAGS_SUPPORT help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug index 28a43d63bde1..958f790273ab 100644 --- a/arch/sh/Kconfig.debug +++ b/arch/sh/Kconfig.debug @@ -1,8 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config SH_STANDARD_BIOS bool "Use LinuxSH standard BIOS" help diff --git a/arch/sh/boot/Makefile b/arch/sh/boot/Makefile index 58592dfa5cb6..c081e7e2d6e7 100644 --- a/arch/sh/boot/Makefile +++ b/arch/sh/boot/Makefile @@ -80,30 +80,30 @@ $(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE $(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE $(call if_changed,lzo) -$(obj)/uImage.bz2: $(obj)/vmlinux.bin.bz2 +$(obj)/uImage.bz2: $(obj)/vmlinux.bin.bz2 FORCE $(call if_changed,uimage,bzip2) -$(obj)/uImage.gz: $(obj)/vmlinux.bin.gz +$(obj)/uImage.gz: $(obj)/vmlinux.bin.gz FORCE $(call if_changed,uimage,gzip) -$(obj)/uImage.lzma: $(obj)/vmlinux.bin.lzma +$(obj)/uImage.lzma: $(obj)/vmlinux.bin.lzma FORCE $(call if_changed,uimage,lzma) -$(obj)/uImage.xz: $(obj)/vmlinux.bin.xz +$(obj)/uImage.xz: $(obj)/vmlinux.bin.xz FORCE $(call if_changed,uimage,xz) -$(obj)/uImage.lzo: $(obj)/vmlinux.bin.lzo +$(obj)/uImage.lzo: $(obj)/vmlinux.bin.lzo FORCE $(call if_changed,uimage,lzo) -$(obj)/uImage.bin: $(obj)/vmlinux.bin +$(obj)/uImage.bin: $(obj)/vmlinux.bin FORCE $(call if_changed,uimage,none) OBJCOPYFLAGS_vmlinux.srec := -I binary -O srec -$(obj)/vmlinux.srec: $(obj)/compressed/vmlinux +$(obj)/vmlinux.srec: $(obj)/compressed/vmlinux FORCE $(call if_changed,objcopy) OBJCOPYFLAGS_uImage.srec := -I binary -O srec -$(obj)/uImage.srec: $(obj)/uImage +$(obj)/uImage.srec: $(obj)/uImage FORCE $(call if_changed,objcopy) $(obj)/uImage: $(obj)/uImage.$(suffix-y) diff --git a/arch/sh/boot/compressed/install.sh b/arch/sh/boot/compressed/install.sh deleted file mode 100644 index f9f41818b17e..000000000000 --- a/arch/sh/boot/compressed/install.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/sh -# -# arch/sh/boot/install.sh -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 1995 by Linus Torvalds -# -# Adapted from code in arch/i386/boot/Makefile by H. 
Peter Anvin -# Adapted from code in arch/i386/boot/install.sh by Russell King -# Adapted from code in arch/arm/boot/install.sh by Stuart Menefy -# -# "make install" script for sh architecture -# -# Arguments: -# $1 - kernel version -# $2 - kernel image file -# $3 - kernel map file -# $4 - default install path (blank if root directory) -# - -# User may have a custom install script - -if [ -x /sbin/${INSTALLKERNEL} ]; then - exec /sbin/${INSTALLKERNEL} "$@" -fi - -if [ "$2" = "zImage" ]; then -# Compressed install - echo "Installing compressed kernel" - if [ -f $4/vmlinuz-$1 ]; then - mv $4/vmlinuz-$1 $4/vmlinuz.old - fi - - if [ -f $4/System.map-$1 ]; then - mv $4/System.map-$1 $4/System.old - fi - - cat $2 > $4/vmlinuz-$1 - cp $3 $4/System.map-$1 -else -# Normal install - echo "Installing normal kernel" - if [ -f $4/vmlinux-$1 ]; then - mv $4/vmlinux-$1 $4/vmlinux.old - fi - - if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old - fi - - cat $2 > $4/vmlinux-$1 - cp $3 $4/System.map -fi diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 4486a865ff62..372afa82fee6 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -63,6 +63,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma, if (boot_cpu_data.dcache.n_aliases && PageAnon(page)) __flush_anon_page(page, vmaddr); } + +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { __flush_wback_region(addr, size); @@ -72,12 +74,6 @@ static inline void invalidate_kernel_vmap_range(void *addr, int size) __flush_invalidate_region(addr, size); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ - flush_dcache_page(page); -} - extern void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len); diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h index 56bf35c2f29c..cdced80a7ffa 100644 --- a/arch/sh/include/asm/pgtable-3level.h +++ b/arch/sh/include/asm/pgtable-3level.h @@ -34,7 +34,7 @@ typedef struct { unsigned long long pmd; } pmd_t; static inline pmd_t *pud_pgtable(pud_t pud) { - return (pmd_t *)pud_val(pud); + return (pmd_t *)(unsigned long)pud_val(pud); } /* only used by the stubbed out hugetlb gup code, should never be called */ diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 7bbd6700ae4b..208f131659c5 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index ce26c7f8950a..506784702430 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -414,8 +414,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index fa650e4eadba..b120ed947f50 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -47,6 
+47,7 @@ config SPARC select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH select SET_FS + select TRACE_IRQFLAGS_SUPPORT config SPARC32 def_bool !64BIT diff --git a/arch/sparc/Kconfig.debug b/arch/sparc/Kconfig.debug index 50a918d496c8..6b2bec1888b3 100644 --- a/arch/sparc/Kconfig.debug +++ b/arch/sparc/Kconfig.debug @@ -1,9 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - bool - default y - config DEBUG_DCFLUSH bool "D-cache flush debugging" depends on SPARC64 && DEBUG_KERNEL diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile index 4e65245bc755..24fb5a99f439 100644 --- a/arch/sparc/Makefile +++ b/arch/sparc/Makefile @@ -72,7 +72,8 @@ image zImage uImage tftpboot.img vmlinux.aout: vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ install: - $(Q)$(MAKE) $(build)=$(boot) $@ + sh $(srctree)/$(boot)/install.sh $(KERNELRELEASE) $(KBUILD_IMAGE) \ + System.map "$(INSTALL_PATH)" archclean: $(Q)$(MAKE) $(clean)=$(boot) diff --git a/arch/sparc/boot/Makefile b/arch/sparc/boot/Makefile index 380e2b018992..849236d4eca4 100644 --- a/arch/sparc/boot/Makefile +++ b/arch/sparc/boot/Makefile @@ -70,7 +70,3 @@ $(obj)/image: vmlinux FORCE $(obj)/tftpboot.img: $(obj)/image $(obj)/piggyback System.map $(ROOT_IMG) FORCE $(call if_changed,elftoaout) $(call if_changed,piggy) - -install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/zImage \ - System.map "$(INSTALL_PATH)" diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 8b63410e830f..bd949fcf9d63 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -116,25 +116,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -#ifdef CONFIG_COMPAT -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = current_thread_info()->kregs; - unsigned long usp = regs->u_regs[UREG_I6]; - - if (test_thread_64bit_stack(usp)) - usp += STACK_BIAS; - - if (test_thread_flag(TIF_32BIT)) - usp &= 0xffffffffUL; - - usp -= len; - usp &= ~0x7UL; - - return (void __user *) usp; -} -#endif - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index 8e1d72a16759..7ceae24b0ca9 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -356,7 +356,9 @@ err_nomem: void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - if (!sparc_dma_free_resource(cpu_addr, PAGE_ALIGN(size))) + size = PAGE_ALIGN(size); + + if (!sparc_dma_free_resource(cpu_addr, size)) return; dma_make_coherent(dma_addr, size); diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 8e645ddac58e..30f171b7b00c 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -39,6 +39,7 @@ struct mdesc_hdr { u32 node_sz; /* node block size */ u32 name_sz; /* name block size */ u32 data_sz; /* data block size */ + char data[]; } __attribute__((aligned(16))); struct mdesc_elem { @@ -612,7 +613,7 @@ EXPORT_SYMBOL(mdesc_get_node_info); static struct mdesc_elem *node_block(struct mdesc_hdr *mdesc) { - return (struct mdesc_elem *) (mdesc + 1); + return (struct mdesc_elem *) mdesc->data; } static void *name_block(struct mdesc_hdr *mdesc) diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 93983d6d431d..bbbe0cfef746 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -8,9 +8,6 @@ /* * This file handles the architecture-dependent parts of process 
handling.. */ - -#include <stdarg.h> - #include <linux/elfcore.h> #include <linux/errno.h> #include <linux/module.h> diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 547b06b49ce3..d1cc410d2f64 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -9,9 +9,6 @@ /* * This file handles the architecture-dependent parts of process handling.. */ - -#include <stdarg.h> - #include <linux/errno.h> #include <linux/export.h> #include <linux/sched.h> @@ -458,7 +455,7 @@ static unsigned long clone_stackframe(unsigned long csp, unsigned long psp) distance = fp - psp; rval = (csp - distance); - if (copy_in_user((void __user *) rval, (void __user *) psp, distance)) + if (raw_copy_in_user((void __user *)rval, (void __user *)psp, distance)) rval = 0; else if (!stack_64bit) { if (put_user(((u32)csp), diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 4276b9e003ca..6cc124a3bb98 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -435,9 +435,9 @@ static int setup_frame32(struct ksignal *ksig, struct pt_regs *regs, (_COMPAT_NSIG_WORDS - 1) * sizeof(unsigned int)); if (!wsaved) { - err |= copy_in_user((u32 __user *)sf, - (u32 __user *)(regs->u_regs[UREG_FP]), - sizeof(struct reg_window32)); + err |= raw_copy_in_user((u32 __user *)sf, + (u32 __user *)(regs->u_regs[UREG_FP]), + sizeof(struct reg_window32)); } else { struct reg_window *rp; @@ -567,9 +567,9 @@ static int setup_rt_frame32(struct ksignal *ksig, struct pt_regs *regs, err |= put_compat_sigset(&sf->mask, oldset, sizeof(compat_sigset_t)); if (!wsaved) { - err |= copy_in_user((u32 __user *)sf, - (u32 __user *)(regs->u_regs[UREG_FP]), - sizeof(struct reg_window32)); + err |= raw_copy_in_user((u32 __user *)sf, + (u32 __user *)(regs->u_regs[UREG_FP]), + sizeof(struct reg_window32)); } else { struct reg_window *rp; diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index cea23cf95600..2a78d2af1265 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -406,10 +406,10 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) err |= copy_to_user(&sf->mask, sigmask_to_save(), sizeof(sigset_t)); if (!wsaved) { - err |= copy_in_user((u64 __user *)sf, - (u64 __user *)(regs->u_regs[UREG_FP] + - STACK_BIAS), - sizeof(struct reg_window)); + err |= raw_copy_in_user((u64 __user *)sf, + (u64 __user *)(regs->u_regs[UREG_FP] + + STACK_BIAS), + sizeof(struct reg_window)); } else { struct reg_window *rp; diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index f520e9cd2c78..c37764dc764d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -365,12 +365,12 @@ 299 common unshare sys_unshare 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list 301 common get_robust_list sys_get_robust_list compat_sys_get_robust_list -302 common migrate_pages sys_migrate_pages compat_sys_migrate_pages -303 common mbind sys_mbind compat_sys_mbind -304 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -305 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +302 common migrate_pages sys_migrate_pages +303 common mbind sys_mbind +304 common get_mempolicy sys_get_mempolicy +305 common set_mempolicy sys_set_mempolicy 306 common kexec_load sys_kexec_load compat_sys_kexec_load -307 common move_pages sys_move_pages compat_sys_move_pages +307 common move_pages sys_move_pages 308 common getcpu 
sys_getcpu 309 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 310 32 utimensat sys_utimensat_time32 @@ -492,3 +492,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/sparc/lib/iomap.c b/arch/sparc/lib/iomap.c index c9da9f139694..f3a8cd491ce0 100644 --- a/arch/sparc/lib/iomap.c +++ b/arch/sparc/lib/iomap.c @@ -19,8 +19,10 @@ void ioport_unmap(void __iomem *addr) EXPORT_SYMBOL(ioport_map); EXPORT_SYMBOL(ioport_unmap); +#ifdef CONFIG_PCI void pci_iounmap(struct pci_dev *dev, void __iomem * addr) { /* nothing to do */ } EXPORT_SYMBOL(pci_iounmap); +#endif diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 77e66d3719f6..c18b45f75d41 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -22,7 +22,9 @@ config UML select GENERIC_CPU_DEVICES select HAVE_GCC_PLUGINS select SET_FS + select TRACE_IRQFLAGS_SUPPORT select TTY # Needed for line.c + select HAVE_ARCH_VMAP_STACK config MMU bool @@ -52,10 +54,6 @@ config ISA config SBUS bool -config TRACE_IRQFLAGS_SUPPORT - bool - default y - config LOCKDEP_SUPPORT bool default y diff --git a/arch/um/Makefile b/arch/um/Makefile index 12a7acef0357..f2fe63bfd819 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -41,8 +41,8 @@ endif HOST_DIR := arch/$(HEADER_ARCH) -include $(ARCH_DIR)/Makefile-skas -include $(HOST_DIR)/Makefile.um +include $(srctree)/$(ARCH_DIR)/Makefile-skas +include $(srctree)/$(HOST_DIR)/Makefile.um core-y += $(HOST_DIR)/um/ @@ -76,7 +76,7 @@ USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -I%,,$(KBUILD_CFLAGS))) \ -idirafter $(objtree)/include -D__KERNEL__ -D__UM_HOST__ #This will adjust *FLAGS accordingly to the platform. -include $(ARCH_DIR)/Makefile-os-$(OS) +include $(srctree)/$(ARCH_DIR)/Makefile-os-$(OS) KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/include \ -I$(srctree)/$(HOST_DIR)/include/uapi \ diff --git a/arch/um/drivers/rtc_user.c b/arch/um/drivers/rtc_user.c index 4016bc1d577e..7c3cec4c68cf 100644 --- a/arch/um/drivers/rtc_user.c +++ b/arch/um/drivers/rtc_user.c @@ -3,6 +3,7 @@ * Copyright (C) 2020 Intel Corporation * Author: Johannes Berg <johannes@sipsolutions.net> */ +#include <stdbool.h> #include <os.h> #include <errno.h> #include <sched.h> diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index bae53220ce26..e4ffeb9a1fa4 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -3,6 +3,7 @@ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ +#include <stdbool.h> #include <stdio.h> #include <unistd.h> #include <stdarg.h> diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index 0b802834f40a..c08066633023 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -56,6 +56,13 @@ static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; #define UM_VIRT_PCI_MAXDELAY 40000 +struct um_pci_message_buffer { + struct virtio_pcidev_msg hdr; + u8 data[8]; +}; + +static struct um_pci_message_buffer __percpu *um_pci_msg_bufs; + static int um_pci_send_cmd(struct um_pci_device *dev, struct virtio_pcidev_msg *cmd, unsigned int cmd_size, @@ -68,11 +75,12 @@ static int um_pci_send_cmd(struct um_pci_device *dev, [1] = extra ? &extra_sg : &in_sg, [2] = extra ? 
&in_sg : NULL, }; + struct um_pci_message_buffer *buf; int delay_count = 0; int ret, len; bool posted; - if (WARN_ON(cmd_size < sizeof(*cmd))) + if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) return -EINVAL; switch (cmd->op) { @@ -88,6 +96,9 @@ static int um_pci_send_cmd(struct um_pci_device *dev, break; } + buf = get_cpu_var(um_pci_msg_bufs); + memcpy(buf, cmd, cmd_size); + if (posted) { u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC); @@ -102,7 +113,10 @@ static int um_pci_send_cmd(struct um_pci_device *dev, } else { /* try without allocating memory */ posted = false; + cmd = (void *)buf; } + } else { + cmd = (void *)buf; } sg_init_one(&out_sg, cmd, cmd_size); @@ -118,11 +132,12 @@ static int um_pci_send_cmd(struct um_pci_device *dev, posted ? cmd : HANDLE_NO_FREE(cmd), GFP_ATOMIC); if (ret) - return ret; + goto out; if (posted) { virtqueue_kick(dev->cmd_vq); - return 0; + ret = 0; + goto out; } /* kick and poll for getting a response on the queue */ @@ -148,6 +163,8 @@ static int um_pci_send_cmd(struct um_pci_device *dev, } clear_bit(UM_PCI_STAT_WAITING, &dev->status); +out: + put_cpu_var(um_pci_msg_bufs); return ret; } @@ -161,12 +178,17 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, .size = size, .addr = offset, }; - /* maximum size - we may only use parts of it */ - u8 data[8]; + /* buf->data is maximum size - we may only use parts of it */ + struct um_pci_message_buffer *buf; + u8 *data; + unsigned long ret = ~0ULL; if (!dev) return ~0ULL; + buf = get_cpu_var(um_pci_msg_bufs); + data = buf->data; + memset(data, 0xff, sizeof(data)); switch (size) { @@ -179,27 +201,34 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, break; default: WARN(1, "invalid config space read size %d\n", size); - return ~0ULL; + goto out; } - if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, - data, sizeof(data))) - return ~0ULL; + if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, 8)) + goto out; switch (size) { case 1: - return data[0]; + ret = data[0]; + break; case 2: - return le16_to_cpup((void *)data); + ret = le16_to_cpup((void *)data); + break; case 4: - return le32_to_cpup((void *)data); + ret = le32_to_cpup((void *)data); + break; #ifdef CONFIG_64BIT case 8: - return le64_to_cpup((void *)data); + ret = le64_to_cpup((void *)data); + break; #endif default: - return ~0ULL; + break; } + +out: + put_cpu_var(um_pci_msg_bufs); + return ret; } static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, @@ -272,8 +301,13 @@ static void um_pci_bar_copy_from(void *priv, void *buffer, static unsigned long um_pci_bar_read(void *priv, unsigned int offset, int size) { - /* maximum size - we may only use parts of it */ - u8 data[8]; + /* buf->data is maximum size - we may only use parts of it */ + struct um_pci_message_buffer *buf; + u8 *data; + unsigned long ret = ~0ULL; + + buf = get_cpu_var(um_pci_msg_bufs); + data = buf->data; switch (size) { case 1: @@ -285,25 +319,33 @@ static unsigned long um_pci_bar_read(void *priv, unsigned int offset, break; default: WARN(1, "invalid config space read size %d\n", size); - return ~0ULL; + goto out; } um_pci_bar_copy_from(priv, data, offset, size); switch (size) { case 1: - return data[0]; + ret = data[0]; + break; case 2: - return le16_to_cpup((void *)data); + ret = le16_to_cpup((void *)data); + break; case 4: - return le32_to_cpup((void *)data); + ret = le32_to_cpup((void *)data); + break; #ifdef CONFIG_64BIT case 8: - return le64_to_cpup((void *)data); + ret = 
le64_to_cpup((void *)data); + break; #endif default: - return ~0ULL; + break; } + +out: + put_cpu_var(um_pci_msg_bufs); + return ret; } static void um_pci_bar_copy_to(void *priv, unsigned int offset, @@ -810,7 +852,7 @@ void *pci_root_bus_fwnode(struct pci_bus *bus) return um_pci_fwnode; } -int um_pci_init(void) +static int um_pci_init(void) { int err, i; @@ -823,10 +865,16 @@ int um_pci_init(void) "No virtio device ID configured for PCI - no PCI support\n")) return 0; - bridge = pci_alloc_host_bridge(0); - if (!bridge) + um_pci_msg_bufs = alloc_percpu(struct um_pci_message_buffer); + if (!um_pci_msg_bufs) return -ENOMEM; + bridge = pci_alloc_host_bridge(0); + if (!bridge) { + err = -ENOMEM; + goto free; + } + um_pci_fwnode = irq_domain_alloc_named_fwnode("um-pci"); if (!um_pci_fwnode) { err = -ENOMEM; @@ -878,18 +926,22 @@ free: irq_domain_remove(um_pci_inner_domain); if (um_pci_fwnode) irq_domain_free_fwnode(um_pci_fwnode); - pci_free_resource_list(&bridge->windows); - pci_free_host_bridge(bridge); + if (bridge) { + pci_free_resource_list(&bridge->windows); + pci_free_host_bridge(bridge); + } + free_percpu(um_pci_msg_bufs); return err; } module_init(um_pci_init); -void um_pci_exit(void) +static void um_pci_exit(void) { unregister_virtio_driver(&um_pci_virtio_driver); irq_domain_remove(um_pci_msi_domain); irq_domain_remove(um_pci_inner_domain); pci_free_resource_list(&bridge->windows); pci_free_host_bridge(bridge); + free_percpu(um_pci_msg_bufs); } module_exit(um_pci_exit); diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 4412d6febade..d51e445df797 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -27,6 +27,7 @@ #include <linux/virtio_config.h> #include <linux/virtio_ring.h> #include <linux/time-internal.h> +#include <linux/virtio-uml.h> #include <shared/as-layout.h> #include <irq_kern.h> #include <init.h> @@ -1139,7 +1140,7 @@ static int virtio_uml_probe(struct platform_device *pdev) rc = os_connect_socket(pdata->socket_path); } while (rc == -EINTR); if (rc < 0) - return rc; + goto error_free; vu_dev->sock = rc; spin_lock_init(&vu_dev->sock_lock); @@ -1160,6 +1161,8 @@ static int virtio_uml_probe(struct platform_device *pdev) error_init: os_close_file(vu_dev->sock); +error_free: + kfree(vu_dev); return rc; } diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index 065829f443ae..86a8a573b65c 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -7,7 +7,6 @@ #define __IRQ_USER_H__ #include <sysdep/ptrace.h> -#include <stdbool.h> enum um_irq_type { IRQ_READ, diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 60b84edc8a68..96d400387c93 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -8,7 +8,6 @@ #ifndef __OS_H__ #define __OS_H__ -#include <stdarg.h> #include <irq_user.h> #include <longjmp.h> #include <mm_id.h> diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c index 5afac0fef24e..ff5061f29167 100644 --- a/arch/um/kernel/skas/clone.c +++ b/arch/um/kernel/skas/clone.c @@ -24,8 +24,7 @@ void __attribute__ ((__section__ (".__syscall_stub"))) stub_clone_handler(void) { - int stack; - struct stub_data *data = (void *) ((unsigned long)&stack & ~(UM_KERN_PAGE_SIZE - 1)); + struct stub_data *data = get_stub_page(); long err; err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index ad12f78bda7e..3198c4767387 100644 --- 
a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -311,7 +311,3 @@ void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { do_IRQ(WINCH_IRQ, regs); } - -void trap_init(void) -{ -} diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 6de99bb16113..6cf098c23a39 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -67,7 +67,7 @@ int signals_enabled; #ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT static int signals_blocked; #else -#define signals_blocked false +#define signals_blocked 0 #endif static unsigned int signals_pending; static unsigned int signals_active = 0; diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 07327425d06e..41297ec404bf 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -3,6 +3,7 @@ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ +#include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1146b85d708b..ab83c22d274e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -259,6 +259,7 @@ config X86 select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE) select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select TRACE_IRQFLAGS_SUPPORT select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS select HAVE_ARCH_KCSAN if X86_64 @@ -338,6 +339,11 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK config ARCH_HIBERNATION_POSSIBLE def_bool y +config ARCH_NR_GPIO + int + default 1024 if X86_64 + default 512 + config ARCH_SUSPEND_POSSIBLE def_bool y @@ -2604,7 +2610,6 @@ config PCI_OLPC config PCI_XEN def_bool y depends on PCI && XEN - select SWIOTLB_XEN config MMCONF_FAM10H def_bool y diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 80b57e7f4947..d3a6f74a94bd 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -1,8 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config TRACE_IRQFLAGS_NMI_SUPPORT def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index d82d01490dd3..7488cfbbd2f6 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -73,7 +73,7 @@ ifeq ($(CONFIG_X86_32),y) KBUILD_CFLAGS += $(cc_stack_align4) # CPU-specific tuning. Anything which can be shared with UML should go here. - include arch/x86/Makefile_32.cpu + include $(srctree)/arch/x86/Makefile_32.cpu KBUILD_CFLAGS += $(cflags-y) # temporary until string.h is fixed diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index cd3056759880..94834c4b5e5e 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -2,12 +2,12 @@ # CPU tuning section - shared with UML. # Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML. -#-mtune exists since gcc 3.4 -HAS_MTUNE := $(call cc-option-yn, -mtune=i386) -ifeq ($(HAS_MTUNE),y) tune = $(call cc-option,-mtune=$(1),$(2)) + +ifdef CONFIG_CC_IS_CLANG +align := -falign-functions=0 $(call cc-option,-falign-jumps=0) $(call cc-option,-falign-loops=0) else -tune = $(call cc-option,-mcpu=$(1),$(2)) +align := -falign-functions=0 -falign-jumps=0 -falign-loops=0 endif cflags-$(CONFIG_M486SX) += -march=i486 @@ -25,11 +25,11 @@ cflags-$(CONFIG_MK6) += -march=k6 # They make zero difference whatsosever to performance at this time. 
cflags-$(CONFIG_MK7) += -march=athlon cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) -cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0 -cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0 +cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align) +cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align) cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) -cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-functions=0 -falign-jumps=0 -falign-loops=0 +cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align) cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index ca866f1cca2e..34c9dbb6a47d 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -18,7 +18,7 @@ #ifndef __ASSEMBLY__ -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/types.h> #include <linux/edd.h> #include <asm/setup.h> diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 9c9c4a888b1d..e81885384f60 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -156,7 +156,6 @@ CONFIG_FORCEDETH=y CONFIG_8139TOO=y # CONFIG_8139TOO_PIO is not set CONFIG_R8169=y -CONFIG_INPUT_POLLDEV=y CONFIG_INPUT_EVDEV=y CONFIG_INPUT_JOYSTICK=y CONFIG_INPUT_TABLET=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index b60bd2d86034..e8a7a0af2bda 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -148,7 +148,6 @@ CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y CONFIG_R8169=y -CONFIG_INPUT_POLLDEV=y CONFIG_INPUT_EVDEV=y CONFIG_INPUT_JOYSTICK=y CONFIG_INPUT_TABLET=y diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index a5beae6daf20..960a021d543e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -286,7 +286,7 @@ 272 i386 fadvise64_64 sys_ia32_fadvise64_64 273 i386 vserver 274 i386 mbind sys_mbind -275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +275 i386 get_mempolicy sys_get_mempolicy 276 i386 set_mempolicy sys_set_mempolicy 277 i386 mq_open sys_mq_open compat_sys_mq_open 278 i386 mq_unlink sys_mq_unlink @@ -328,7 +328,7 @@ 314 i386 sync_file_range sys_ia32_sync_file_range 315 i386 tee sys_tee 316 i386 vmsplice sys_vmsplice -317 i386 move_pages sys_move_pages compat_sys_move_pages +317 i386 move_pages sys_move_pages 318 i386 getcpu sys_getcpu 319 i386 epoll_pwait sys_epoll_pwait 320 i386 utimensat sys_utimensat_time32 @@ -452,3 +452,4 @@ 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret +448 i386 process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f6b57799c1ea..18b5500ea8bf 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -369,6 +369,7 @@ 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self 447 common memfd_secret sys_memfd_secret +448 common 
process_mrelease sys_process_mrelease # # Due to a historical design error, certain syscalls are numbered differently @@ -397,7 +398,7 @@ 530 x32 set_robust_list compat_sys_set_robust_list 531 x32 get_robust_list compat_sys_get_robust_list 532 x32 vmsplice sys_vmsplice -533 x32 move_pages compat_sys_move_pages +533 x32 move_pages sys_move_pages 534 x32 preadv compat_sys_preadv64 535 x32 pwritev compat_sys_pwritev64 536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 05c4abc2fdfd..a2dddcc189f6 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -131,7 +131,7 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE targets += vdsox32.lds $(vobjx32s-y) $(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table -$(obj)/%.so: $(obj)/%.so.dbg +$(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 90e682a92820..32a1ad356c18 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -99,7 +99,8 @@ static void hv_apic_eoi_write(u32 reg, u32 val) /* * IPI implementation on Hyper-V. */ -static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) +static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, + bool exclude_self) { struct hv_send_ipi_ex **arg; struct hv_send_ipi_ex *ipi_arg; @@ -123,7 +124,10 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) if (!cpumask_equal(mask, cpu_present_mask)) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); + if (exclude_self) + nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask); + else + nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); } if (nr_bank < 0) goto ipi_mask_ex_done; @@ -138,15 +142,25 @@ ipi_mask_ex_done: return hv_result_success(status); } -static bool __send_ipi_mask(const struct cpumask *mask, int vector) +static bool __send_ipi_mask(const struct cpumask *mask, int vector, + bool exclude_self) { - int cur_cpu, vcpu; + int cur_cpu, vcpu, this_cpu = smp_processor_id(); struct hv_send_ipi ipi_arg; u64 status; + unsigned int weight; trace_hyperv_send_ipi_mask(mask, vector); - if (cpumask_empty(mask)) + weight = cpumask_weight(mask); + + /* + * Do nothing if + * 1. the mask is empty + * 2. 
the mask only contains self when exclude_self is true + */ + if (weight == 0 || + (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask))) return true; if (!hv_hypercall_pg) @@ -172,6 +186,8 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector) ipi_arg.cpu_mask = 0; for_each_cpu(cur_cpu, mask) { + if (exclude_self && cur_cpu == this_cpu) + continue; vcpu = hv_cpu_number_to_vp_number(cur_cpu); if (vcpu == VP_INVAL) return false; @@ -191,7 +207,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector) return hv_result_success(status); do_ex_hypercall: - return __send_ipi_mask_ex(mask, vector); + return __send_ipi_mask_ex(mask, vector, exclude_self); } static bool __send_ipi_one(int cpu, int vector) @@ -208,7 +224,7 @@ static bool __send_ipi_one(int cpu, int vector) return false; if (vp >= 64) - return __send_ipi_mask_ex(cpumask_of(cpu), vector); + return __send_ipi_mask_ex(cpumask_of(cpu), vector, false); status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp)); return hv_result_success(status); @@ -222,20 +238,13 @@ static void hv_send_ipi(int cpu, int vector) static void hv_send_ipi_mask(const struct cpumask *mask, int vector) { - if (!__send_ipi_mask(mask, vector)) + if (!__send_ipi_mask(mask, vector, false)) orig_apic.send_IPI_mask(mask, vector); } static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned int this_cpu = smp_processor_id(); - struct cpumask new_mask; - const struct cpumask *local_mask; - - cpumask_copy(&new_mask, mask); - cpumask_clear_cpu(this_cpu, &new_mask); - local_mask = &new_mask; - if (!__send_ipi_mask(local_mask, vector)) + if (!__send_ipi_mask(mask, vector, true)) orig_apic.send_IPI_mask_allbutself(mask, vector); } @@ -246,7 +255,7 @@ static void hv_send_ipi_allbutself(int vector) static void hv_send_ipi_all(int vector) { - if (!__send_ipi_mask(cpu_online_mask, vector)) + if (!__send_ipi_mask(cpu_online_mask, vector, false)) orig_apic.send_IPI_all(vector); } diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 5e5b9fc2747f..9bd15241fadb 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -202,8 +202,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | - MAP_32BIT, + MAP_FIXED | MAP_PRIVATE | MAP_32BIT, fd_offset); if (error != N_TXTADDR(ex)) @@ -211,8 +210,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | - MAP_32BIT, + MAP_FIXED | MAP_PRIVATE | MAP_32BIT, fd_offset + ex.a_text); if (error != N_DATADDR(ex)) return error; @@ -293,7 +291,7 @@ static int load_aout_library(struct file *file) /* Now use mmap to map the library into memory. 
*/ error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT, + MAP_FIXED | MAP_PRIVATE | MAP_32BIT, N_TXTOFF(ex)); retval = error; if (error != start_addr) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 4ae01cdb99de..7516e4199b3c 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -156,19 +156,6 @@ struct compat_shmid64_ds { (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) #endif -static inline void __user *arch_compat_alloc_user_space(long len) -{ - compat_uptr_t sp = task_pt_regs(current)->sp; - - /* - * -128 for the x32 ABI redzone. For IA32, it is not strictly - * necessary, but not harmful. - */ - sp -= 128; - - return (void __user *)round_down(sp - len, 16); -} - static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index a12a4987154e..cefe1d81e2e8 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -72,7 +72,6 @@ KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) KVM_X86_OP(update_cr8_intercept) KVM_X86_OP(check_apicv_inhibit_reasons) -KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP(hwapic_irr_update) KVM_X86_OP(hwapic_isr_update) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af6ce8d4c86a..f8f48a7ec577 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -37,9 +37,21 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS -#define KVM_MAX_VCPUS 288 -#define KVM_SOFT_MAX_VCPUS 240 -#define KVM_MAX_VCPU_ID 1023 +#define KVM_MAX_VCPUS 1024 +#define KVM_SOFT_MAX_VCPUS 710 + +/* + * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs + * might be larger than the actual number of VCPUs because the + * APIC ID encodes CPU topology information. + * + * In the worst case, we'll need less than one extra bit for the + * Core ID, and less than one extra bit for the Package (Die) ID, + * so ratio of 4 should be enough. + */ +#define KVM_VCPU_ID_RATIO 4 +#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) + /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -124,13 +136,6 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) -static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) -{ - /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ - return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); -} - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 @@ -229,7 +234,8 @@ enum x86_intercept_stage; KVM_GUESTDBG_USE_HW_BP | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_INJECT_BP | \ - KVM_GUESTDBG_INJECT_DB) + KVM_GUESTDBG_INJECT_DB | \ + KVM_GUESTDBG_BLOCKIRQ) #define PFERR_PRESENT_BIT 0 @@ -447,6 +453,7 @@ struct kvm_mmu { u64 *pae_root; u64 *pml4_root; + u64 *pml5_root; /* * check zero bits on shadow page table entries, these @@ -482,6 +489,7 @@ struct kvm_pmc { * ctrl value for fixed counters. 
*/ u64 current_config; + bool is_paused; }; struct kvm_pmu { @@ -522,7 +530,6 @@ struct kvm_pmu_ops; enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, - KVM_DEBUGREG_RELOAD = 4, }; struct kvm_mtrr_range { @@ -723,7 +730,6 @@ struct kvm_vcpu_arch { u64 reserved_gpa_bits; int maxphyaddr; - int max_tdp_level; /* emulate context */ @@ -988,6 +994,12 @@ struct kvm_hv { /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; + /* + * How many SynICs use 'AutoEOI' feature + * (protected by arch.apicv_update_lock) + */ + unsigned int synic_auto_eoi_used; + struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; }; @@ -1002,9 +1014,8 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { bool long_mode; - bool shinfo_set; u8 upcall_vector; - struct gfn_to_hva_cache shinfo_cache; + gfn_t shinfo_gfn; }; enum kvm_irqchip_mode { @@ -1061,6 +1072,9 @@ struct kvm_arch { struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; + /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */ + struct mutex apicv_update_lock; + bool apic_access_memslot_enabled; unsigned long apicv_inhibit_reasons; @@ -1213,9 +1227,17 @@ struct kvm_vm_stat { u64 mmu_recycled; u64 mmu_cache_miss; u64 mmu_unsync; - u64 lpages; + union { + struct { + atomic64_t pages_4k; + atomic64_t pages_2m; + atomic64_t pages_1g; + }; + atomic64_t pages[KVM_NR_PAGE_SIZES]; + }; u64 nx_lpage_splits; u64 max_mmu_page_hash_collisions; + u64 max_mmu_rmap_size; }; struct kvm_vcpu_stat { @@ -1359,7 +1381,6 @@ struct kvm_x86_ops { void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); bool (*check_apicv_inhibit_reasons)(ulong bit); - void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); @@ -1543,12 +1564,12 @@ void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); @@ -1744,6 +1765,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void kvm_request_apicv_update(struct kvm *kvm, bool activate, unsigned long bit); +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, + unsigned long bit); + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, @@ -1754,8 +1778,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level); +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level); static inline u16 
kvm_read_ldt(void) { @@ -1779,11 +1803,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline u32 get_rdx_init_val(void) -{ - return 0x600; /* P6 family */ -} - static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); @@ -1816,31 +1835,6 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -asmlinkage void kvm_spurious_fault(void); - -/* - * Hardware virtualization extension instructions may fault if a - * reboot turns off virtualization while processes are running. - * Usually after catching the fault we just panic; during reboot - * instead the instruction is ignored. - */ -#define __kvm_handle_fault_on_reboot(insn) \ - "666: \n\t" \ - insn "\n\t" \ - "jmp 668f \n\t" \ - "667: \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_begin \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t" \ - "call kvm_spurious_fault \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_end \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t" \ - "668: \n\t" \ - _ASM_EXTABLE(666b, 667b) - #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 5c7bcaa79623..1d5f14aff5f6 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H -#define ARCH_DEFAULT_PKEY 0 - /* * If more than 16 keys are ever supported, a thorough audit * will be necessary to ensure that the types that store key diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index f3fbb84ff8a7..68c257a3de0d 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -275,7 +275,7 @@ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; - int zf; + bool zf; /* * ENQCMDS %(rdx), rax diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c9fa7be3df82..5c95d242f38d 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -301,8 +301,8 @@ do { \ unsigned int __gu_low, __gu_high; \ const unsigned int __user *__gu_ptr; \ __gu_ptr = (const void __user *)(ptr); \ - __get_user_asm(__gu_low, ptr, "l", "=r", label); \ - __get_user_asm(__gu_high, ptr+1, "l", "=r", label); \ + __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label); \ + __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label); \ (x) = ((unsigned long long)__gu_high << 32) | __gu_low; \ } while (0) #else diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index e7265a552f4f..45697e04d771 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -58,13 +58,6 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) return copy_user_generic((__force void *)dst, src, size); } -static __always_inline __must_check -unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size) -{ - return copy_user_generic((__force void *)dst, - (__force void *)src, size); -} - extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 
6b56d0d45d15..66b4ddde7743 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -3,14 +3,10 @@ #define _ASM_X86_SWIOTLB_XEN_H #ifdef CONFIG_SWIOTLB_XEN -extern int xen_swiotlb; extern int __init pci_xen_swiotlb_detect(void); -extern void __init pci_xen_swiotlb_init(void); extern int pci_xen_swiotlb_init_late(void); #else -#define xen_swiotlb (0) -static inline int __init pci_xen_swiotlb_detect(void) { return 0; } -static inline void __init pci_xen_swiotlb_init(void) { } +#define pci_xen_swiotlb_detect NULL static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a6c327f8ad9e..2ef1f6513c68 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -295,6 +295,7 @@ struct kvm_debug_exit_arch { #define KVM_GUESTDBG_USE_HW_BP 0x00020000 #define KVM_GUESTDBG_INJECT_DB 0x00040000 #define KVM_GUESTDBG_INJECT_BP 0x00080000 +#define KVM_GUESTDBG_BLOCKIRQ 0x00100000 /* for KVM_SET_GUEST_DEBUG */ struct kvm_guest_debug_arch { diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 294ed4392a0e..10562885f5fc 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -109,14 +109,13 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, - aper_size, aper_size); + addr = memblock_phys_alloc_range(aper_size, aper_size, + GART_MIN_ADDR, GART_MAX_ADDR); if (!addr) { pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); return 0; } - memblock_reserve(addr, aper_size); pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index d66af2950e06..b5e36bd0425b 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -985,7 +985,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->priv = base->nb; } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -1014,7 +1014,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) id4_regs->id = c->apicid >> index_msb; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { unsigned int idx, ret; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -1033,6 +1033,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8cb7816d03b4..193204aee880 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1253,6 +1253,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin static void kill_me_now(struct callback_head *ch) { + struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me); + + p->mce_count = 0; force_sig(SIGBUS); } @@ -1262,6 +1265,7 @@ static void kill_me_maybe(struct callback_head *cb) int flags = MF_ACTION_REQUIRED; int ret; + p->mce_count = 0; pr_err("Uncorrected 
hardware memory error in user-access at %llx", p->mce_addr); if (!p->mce_ripv) @@ -1290,17 +1294,34 @@ static void kill_me_maybe(struct callback_head *cb) } } -static void queue_task_work(struct mce *m, int kill_current_task) +static void queue_task_work(struct mce *m, char *msg, int kill_current_task) { - current->mce_addr = m->addr; - current->mce_kflags = m->kflags; - current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); - current->mce_whole_page = whole_page(m); + int count = ++current->mce_count; - if (kill_current_task) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + /* First call, save all the details */ + if (count == 1) { + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + + if (kill_current_task) + current->mce_kill_me.func = kill_me_now; + else + current->mce_kill_me.func = kill_me_maybe; + } + + /* Ten is likely overkill. Don't expect more than two faults before task_work() */ + if (count > 10) + mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + + /* Second or later call, make sure page address matches the one from first call */ + if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) + mce_panic("Consecutive machine checks to different user pages", m, msg); + + /* Do not call task_work_add() more than once */ + if (count > 1) + return; task_work_add(current, &current->mce_kill_me, TWA_RESUME); } @@ -1438,7 +1459,7 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, kill_current_task); + queue_task_work(&m, msg, kill_current_task); } else { /* @@ -1456,7 +1477,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, kill_current_task); + queue_task_work(&m, msg, kill_current_task); } out: mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a26643dc6bd6..b656456c3a94 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -884,10 +884,11 @@ static void kvm_wait(u8 *ptr, u8 val) } else { local_irq_disable(); + /* safe_halt() will enable IRQ */ if (READ_ONCE(*ptr) == val) safe_halt(); - - local_irq_enable(); + else + local_irq_enable(); } } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index aa15132228da..525876e7b9f4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) if (num_entries > LDT_ENTRIES) return NULL; - new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; @@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) * than PAGE_SIZE.
*/ if (alloc_size > PAGE_SIZE) - new_ldt->entries = vzalloc(alloc_size); + new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else - new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 79f164141116..40ed44ead063 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -830,6 +830,20 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + /* + * Do some memory reservations *before* memory is added to memblock, so + * memblock allocations won't overwrite it. + * + * After this point, everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not RAM in + * e820. All other memory is free game. + * + * This call needs to happen before e820__memory_setup() which calls the + * xen_memory_setup() on Xen dom0 which relies on the fact that those + * early reservations have happened already. + */ + early_reserve_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; e820__memory_setup(); parse_setup_data(); @@ -876,18 +890,6 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - /* - * Do some memory reservations *before* memory is added to - * memblock, so memblock allocations won't overwrite it. - * Do it after early param, so we could get (unlikely) panic from - * serial. - * - * After this point everything still needed from the boot loader or - * firmware or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - early_reserve_memory(); - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 78a32b956e81..5afd98559193 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_free_ptr(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 95a98413dc32..54a83a744538 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -7,6 +7,8 @@ #include <linux/kvm_host.h> #include <linux/debugfs.h> #include "lapic.h" +#include "mmu.h" +#include "mmu/mmu_internal.h" static int vcpu_get_timer_advance_ns(void *data, u64 *val) { @@ -73,3 +75,112 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ &vcpu_tsc_scaling_frac_fops); } } + +/* + * This covers statistics <1024 (11=log(1024)+1), which should be enough to + * cover RMAP_RECYCLE_THRESHOLD. 
+ */ +#define RMAP_LOG_SIZE 11 + +static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" }; + +static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) +{ + struct kvm_rmap_head *rmap; + struct kvm *kvm = m->private; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + unsigned int lpage_size, index; + /* Still small enough to be on the stack */ + unsigned int *log[KVM_NR_PAGE_SIZES], *cur; + int i, j, k, l, ret; + + ret = -ENOMEM; + memset(log, 0, sizeof(log)); + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL); + if (!log[i]) + goto out; + } + + mutex_lock(&kvm->slots_lock); + write_lock(&kvm->mmu_lock); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + for (j = 0; j < slots->used_slots; j++) { + slot = &slots->memslots[j]; + for (k = 0; k < KVM_NR_PAGE_SIZES; k++) { + rmap = slot->arch.rmap[k]; + lpage_size = kvm_mmu_slot_lpages(slot, k + 1); + cur = log[k]; + for (l = 0; l < lpage_size; l++) { + index = ffs(pte_list_count(&rmap[l])); + if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE)) + index = RMAP_LOG_SIZE - 1; + cur[index]++; + } + } + } + } + + write_unlock(&kvm->mmu_lock); + mutex_unlock(&kvm->slots_lock); + + /* index=0 counts no rmap; index=1 counts 1 rmap */ + seq_printf(m, "Rmap_Count:\t0\t1\t"); + for (i = 2; i < RMAP_LOG_SIZE; i++) { + j = 1 << (i - 1); + k = (1 << i) - 1; + seq_printf(m, "%d-%d\t", j, k); + } + seq_printf(m, "\n"); + + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]); + cur = log[i]; + for (j = 0; j < RMAP_LOG_SIZE; j++) + seq_printf(m, "%d\t", cur[j]); + seq_printf(m, "\n"); + } + + ret = 0; +out: + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) + kfree(log[i]); + + return ret; +} + +static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + return single_open(file, kvm_mmu_rmaps_stat_show, kvm); +} + +static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + kvm_put_kvm(kvm); + + return single_release(inode, file); +} + +static const struct file_operations mmu_rmaps_stat_fops = { + .open = kvm_mmu_rmaps_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_mmu_rmaps_stat_release, +}; + +int kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm, + &mmu_rmaps_stat_fops); + return 0; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 41d2a53c5dea..232a86a6faaf 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -88,6 +88,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + int auto_eoi_old, auto_eoi_new; + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; @@ -96,10 +100,30 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, else __clear_bit(vector, synic->vec_bitmap); + auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256); + if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); + + auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256); + + if (!!auto_eoi_old == !!auto_eoi_new) + return; + + 
mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + if (auto_eoi_new) + hv->synic_auto_eoi_used++; + else + hv->synic_auto_eoi_used--; + + __kvm_request_apicv_update(vcpu->kvm, + !hv->synic_auto_eoi_used, + APICV_INHIBIT_REASON_HYPERV); + + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@ -933,12 +957,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) synic = to_hv_synic(vcpu); - /* - * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so request - * to deactivate APICV permanently. - */ - kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; @@ -2476,6 +2494,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; + + ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a6e218c6140d..5a69cce4d72d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; - if (!kvm_vcpu_is_bsp(vcpu) || !pit) + /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */ + if (vcpu->vcpu_id || !pit) return; timer = &pit->pit_state.timer; diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 11e4065e1617..bbd4a5d18b5d 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -35,11 +35,7 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 -#ifdef CONFIG_X86 #define RTC_GSI 8 -#else -#define RTC_GSI -1U -#endif struct dest_map { /* vcpu bitmap where IRQ has been sent */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba5a27879f1d..76fb00921203 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -192,6 +192,9 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return; + WARN_ONCE(!irqchip_in_kernel(kvm), + "Dirty APIC map without an in-kernel local APIC"); + mutex_lock(&kvm->arch.apic_map_lock); /* * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map @@ -2265,9 +2268,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) @@ -2323,6 +2323,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) struct kvm_lapic *apic = vcpu->arch.apic; int i; + if (!init_event) { + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + } + if (!apic) return; @@ -2330,8 +2337,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) hrtimer_cancel(&apic->lapic_timer.timer); if (!init_event) { - kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE); + apic->base_address = APIC_DEFAULT_PHYS_BASE; + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); } kvm_apic_set_version(apic->vcpu); @@ -2364,9 +2371,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic->highest_isr_cache = -1; 
update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); - if (kvm_vcpu_is_bsp(vcpu)) - kvm_lapic_set_base(vcpu, - vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { @@ -2476,11 +2481,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } - /* - * APIC is created enabled. This will prevent kvm_lapic_set_base from - * thinking that APIC state has changed. - */ - vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 83e6c6965f1e..e9688a9f7b57 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -240,4 +240,29 @@ static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) return smp_load_acquire(&kvm->arch.memslots_have_rmaps); } +static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) +{ + /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ + return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); +} + +static inline unsigned long +__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages, + int level) +{ + return gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; +} + +static inline unsigned long +kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level) +{ + return __kvm_mmu_slot_lpages(slot, slot->npages, level); +} + +static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) +{ + atomic64_add(count, &kvm->stat.pages[level - 1]); +} #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 47b765270239..2d7e61122af8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); bool tdp_enabled = false; static int max_huge_page_level __read_mostly; +static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; enum { @@ -137,12 +138,22 @@ module_param(dbg, bool, 0644); #include <trace/events/kvm.h> -/* make pte_list_desc fit well in cache line */ -#define PTE_LIST_EXT 3 +/* make pte_list_desc fit well in cache lines */ +#define PTE_LIST_EXT 14 +/* + * Slight optimization of cacheline layout, by putting `more' and `spte_count' + * at the start; then accessing it will only use one single cacheline for + * either full (entries==PTE_LIST_EXT) case or entries<=6. + */ struct pte_list_desc { - u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; + /* + * Stores number of entries stored in the pte_list_desc. No need to be + * u64 but just for easier alignment. When PTE_LIST_EXT, means full. + */ + u64 spte_count; + u64 *sptes[PTE_LIST_EXT]; }; struct kvm_shadow_walk_iterator { @@ -193,7 +204,7 @@ struct kvm_mmu_role_regs { * the single source of truth for the MMU's state. */ #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ -static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ +static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ { \ return !!(regs->reg & flag); \ } @@ -215,7 +226,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); * and the vCPU may be incorrect/irrelevant. 
*/ #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ -static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \ +static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ { \ return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ } @@ -323,12 +334,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { - /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { - exception->error_code |= PFERR_RSVD_MASK; - return UNMAPPED_GVA; - } - return gpa; } @@ -592,12 +597,13 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. - * Returns non-zero if the PTE was previously valid. + * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(u64 *sptep) +static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; + int level = sptep_to_sp(sptep)->role.level; if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, 0ull); @@ -605,7 +611,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep) old_spte = __update_clear_spte_slow(sptep, 0ull); if (!is_shadow_present_pte(old_spte)) - return 0; + return old_spte; + + kvm_update_page_stats(kvm, level, -1); pfn = spte_to_pfn(old_spte); @@ -622,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); - return 1; + return old_spte; } /* @@ -686,28 +694,36 @@ static bool mmu_spte_age(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - /* - * Prevent page table teardown by making any free-er wait during - * kvm_flush_remote_tlbs() IPI to all active vcpus. - */ - local_irq_disable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_begin(); + } else { + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); - /* - * Make sure a following spte read is not reordered ahead of the write - * to vcpu->mode. - */ - smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* - * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us - * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. - */ - smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); - local_irq_enable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_end(); + } else { + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. 
+ */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) @@ -786,7 +802,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, +static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; @@ -799,12 +815,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, } } -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, 1); } -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, -1); } @@ -893,7 +909,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - int i, count = 0; + int count = 0; if (!rmap_head->val) { rmap_printk("%p %llx 0->1\n", spte, *spte); @@ -903,24 +919,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; + desc->spte_count = 2; rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->sptes[PTE_LIST_EXT-1]) { + while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; - if (!desc->more) { desc->more = mmu_alloc_pte_list_desc(vcpu); desc = desc->more; + desc->spte_count = 0; break; } desc = desc->more; } - for (i = 0; desc->sptes[i]; ++i) - ++count; - desc->sptes[i] = spte; + count += desc->spte_count; + desc->sptes[desc->spte_count++] = spte; } return count; } @@ -930,13 +946,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) { - int j; + int j = desc->spte_count - 1; - for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) - ; desc->sptes[i] = desc->sptes[j]; desc->sptes[j] = NULL; - if (j != 0) + desc->spte_count--; + if (desc->spte_count) return; if (!prev_desc && !desc->more) rmap_head->val = 0; @@ -969,7 +984,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { + for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { pte_list_desc_remove_entry(rmap_head, desc, i, prev_desc); @@ -984,30 +999,68 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) +static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + u64 *sptep) { - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); __pte_list_remove(sptep, rmap_head); } -static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +/* Return true if rmap existed, false otherwise */ +static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { - unsigned long idx; + struct pte_list_desc *desc, *next; + int i; - idx = gfn_to_index(gfn, 
slot->base_gfn, level); - return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; + if (!rmap_head->val) + return false; + + if (!(rmap_head->val & 1)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + goto out; + } + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + for (; desc; desc = next) { + for (i = 0; i < desc->spte_count; i++) + mmu_spte_clear_track_bits(kvm, desc->sptes[i]); + next = desc->more; + mmu_free_pte_list_desc(desc); + } +out: + /* rmap_head is meaningless now, remember to reset it */ + rmap_head->val = 0; + return true; } -static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, - struct kvm_mmu_page *sp) +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; + struct pte_list_desc *desc; + unsigned int count = 0; - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, gfn); - return __gfn_to_rmap(gfn, sp->role.level, slot); + if (!rmap_head->val) + return 0; + else if (!(rmap_head->val & 1)) + return 1; + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + while (desc) { + count += desc->spte_count; + desc = desc->more; + } + + return count; +} + +static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, + const struct kvm_memory_slot *slot) +{ + unsigned long idx; + + idx = gfn_to_index(gfn, slot->base_gfn, level); + return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1020,24 +1073,39 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); return pte_list_add(vcpu, spte, rmap_head); } + static void rmap_remove(struct kvm *kvm, u64 *spte) { + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; gfn_t gfn; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmap_head = gfn_to_rmap(kvm, gfn, sp); + + /* + * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the + * context of a vCPU so have to determine which memslots to use based + * on context information in sp->role. + */ + slots = kvm_memslots_for_spte_role(kvm, sp->role); + + slot = __gfn_to_memslot(slots, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + __pte_list_remove(spte, rmap_head); } @@ -1119,7 +1187,9 @@ out: static void drop_spte(struct kvm *kvm, u64 *sptep) { - if (mmu_spte_clear_track_bits(sptep)) + u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep); + + if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); } @@ -1129,7 +1199,6 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) if (is_large_pte(*sptep)) { WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); - --kvm->stat.lpages; return true; } @@ -1218,7 +1287,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * Returns true iff any D or W bits were cleared. 
*/ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1256,8 +1325,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1289,8 +1358,8 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ @@ -1356,7 +1425,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); + rmap_head = gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } } @@ -1377,20 +1446,9 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("spte %p %llx.\n", sptep, *sptep); - - pte_list_remove(rmap_head, sptep); - flush = true; - } - - return flush; + return pte_list_destroy(kvm, rmap_head); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1421,13 +1479,13 @@ restart: need_flush = 1; if (pte_write(pte)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( *sptep, new_pfn); - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); mmu_spte_set(sptep, new_spte); } } @@ -1442,7 +1500,7 @@ restart: struct slot_rmap_walk_iterator { /* input fields. 
*/ - struct kvm_memory_slot *slot; + const struct kvm_memory_slot *slot; gfn_t start_gfn; gfn_t end_gfn; int start_level; @@ -1462,14 +1520,13 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; - iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); - iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, - iterator->slot); + iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, - struct kvm_memory_slot *slot, int start_level, + const struct kvm_memory_slot *slot, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; @@ -1584,12 +1641,13 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); - - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, @@ -2232,8 +2290,6 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); - if (is_large_pte(pte)) - --kvm->stat.lpages; } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); @@ -2716,15 +2772,12 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); - if (!was_rmapped && is_large_pte(*sptep)) - ++vcpu->kvm->stat.lpages; - if (is_shadow_present_pte(*sptep)) { - if (!was_rmapped) { - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); - } + if (!was_rmapped) { + kvm_update_page_stats(vcpu->kvm, level, 1); + rmap_count = rmap_add(vcpu, sptep, gfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, sptep, gfn); } return ret; @@ -2852,6 +2905,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, int max_level) { struct kvm_lpage_info *linfo; + int host_level; max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { @@ -2863,7 +2917,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - return host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + return min(host_level, max_level); } int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2887,17 +2942,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); - if (level == PG_LEVEL_4K) - return level; - - *req_level = level = min(level, max_level); - /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. 
*/ - if (huge_page_disallowed) + *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); + if (level == PG_LEVEL_4K || huge_page_disallowed) return PG_LEVEL_4K; /* @@ -2965,15 +3015,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, break; drop_large_spte(vcpu, it.sptep); - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - - link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + if (is_shadow_present_pte(*it.sptep)) + continue; + + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); + + link_shadow_page(vcpu, it.sptep, sp); + if (is_tdp && huge_page_disallowed && + req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, @@ -3122,15 +3173,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between walk_shadow_page_lockless_{begin,end}. + * - The returned sptep must not be used after walk_shadow_page_lockless_end. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) { struct kvm_shadow_walk_iterator iterator; + u64 old_spte; + u64 *sptep = NULL; + + for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { + sptep = iterator.sptep; + *spte = old_spte; + + if (!is_shadow_present_pte(old_spte)) + break; + } + + return sptep; +} + +/* + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + */ +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +{ struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte = 0ull; + u64 *sptep = NULL; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) @@ -3141,14 +3217,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) - if (!is_shadow_present_pte(spte)) - break; + if (is_tdp_mmu(vcpu->arch.mmu)) + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + else + sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); if (!is_shadow_present_pte(spte)) break; - sp = sptep_to_sp(iterator.sptep); + sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3206,8 +3283,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. 
*/ - if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3220,8 +3296,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); - trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3455,15 +3530,22 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK | shadow_me_mask; - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } - mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; + + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { + if (WARN_ON_ONCE(!mmu->pml5_root)) { + r = -EIO; + goto out_unlock; + } + mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; + } } for (i = 0; i < 4; ++i) { @@ -3482,7 +3564,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) mmu->pae_root[i] = root | pm_mask; } - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + mmu->root_hpa = __pa(mmu->pml5_root); + else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) mmu->root_hpa = __pa(mmu->pml4_root); else mmu->root_hpa = __pa(mmu->pae_root); @@ -3498,7 +3582,10 @@ out_unlock: static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u64 *pml4_root, *pae_root; + bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL; + u64 *pml5_root = NULL; + u64 *pml4_root = NULL; + u64 *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP @@ -3511,20 +3598,21 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) return 0; /* - * This mess only works with 4-level paging and needs to be updated to - * work with 5-level paging. + * NPT, the only paging mode that uses this horror, uses a fixed number + * of levels for the shadow page tables, e.g. all MMUs are 4-level or + * all MMus are 5-level. Thus, this can safely require that pml5_root + * is allocated if the other roots are valid and pml5 is needed, as any + * prior MMU would also have required pml5. */ - if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) - return -EIO; - - if (mmu->pae_root && mmu->pml4_root) + if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. 
*/ - if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root)) + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || + (need_pml5 && mmu->pml5_root))) return -EIO; /* @@ -3535,16 +3623,31 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (!pae_root) return -ENOMEM; +#ifdef CONFIG_X86_64 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) { - free_page((unsigned long)pae_root); - return -ENOMEM; + if (!pml4_root) + goto err_pml4; + + if (need_pml5) { + pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml5_root) + goto err_pml5; } +#endif mmu->pae_root = pae_root; mmu->pml4_root = pml4_root; + mmu->pml5_root = pml5_root; return 0; + +#ifdef CONFIG_X86_64 +err_pml5: + free_page((unsigned long)pml4_root); +err_pml4: + free_page((unsigned long)pae_root); + return -ENOMEM; +#endif } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -3640,6 +3743,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { @@ -3647,8 +3752,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); @@ -3662,8 +3765,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level break; } - walk_shadow_page_lockless_end(vcpu); - return leaf; } @@ -3675,11 +3776,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; + walk_shadow_page_lockless_begin(vcpu); + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); + walk_shadow_page_lockless_end(vcpu); + if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; @@ -3795,9 +3900,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable) + bool write, bool *writable, int *r) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3808,13 +3913,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) - return true; - - /* Don't expose private memslots to L2. */ - if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { - *pfn = KVM_PFN_NOSLOT; - *writable = false; - return false; + goto out_retry; + + if (!kvm_is_visible_memslot(slot)) { + /* Don't expose private memslots to L2. */ + if (is_guest_mode(vcpu)) { + *pfn = KVM_PFN_NOSLOT; + *writable = false; + return false; + } + /* + * If the APIC access page exists but is disabled, go directly + * to emulation without caching the MMIO access or creating a + * MMIO SPTE. That way the cache doesn't need to be purged + * when the AVIC is re-enabled. 
+ */ + if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && + !kvm_apicv_activated(vcpu->kvm)) { + *r = RET_PF_EMULATE; + return true; + } } async = false; @@ -3828,14 +3946,17 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; + goto out_retry; } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) - return true; + goto out_retry; } *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable, hva); - return false; + +out_retry: + *r = RET_PF_RETRY; + return true; } static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -3854,11 +3975,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_fault) { - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; - } + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; r = mmu_topup_memory_caches(vcpu, false); if (r) @@ -3867,9 +3986,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, - write, &map_writable)) - return RET_PF_RETRY; + if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, + write, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; @@ -4588,6 +4707,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + /* tdp_root_level is architecture forced level, use it if nonzero */ + if (tdp_root_level) + return tdp_root_level; + /* Use 5-level TDP if and only if it's useful/necessary. */ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; @@ -5160,7 +5283,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - if (WARN_ON_ONCE(r == RET_PF_INVALID)) + if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } @@ -5279,10 +5402,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) */ } -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level) { tdp_enabled = enable_tdp; + tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; /* @@ -5302,12 +5426,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot); +typedef bool (*slot_level_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. 
*/ static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, bool flush) @@ -5334,7 +5459,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, bool flush_on_yield) { @@ -5345,7 +5470,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, @@ -5358,6 +5483,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pml4_root); + free_page((unsigned long)mmu->pml5_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) @@ -5587,6 +5713,10 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_mmu_uninit_tdp_mmu(kvm); } +/* + * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end + * (not including it) + */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; @@ -5594,8 +5724,11 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush = false; + write_lock(&kvm->mmu_lock); + + kvm_inc_notifier_count(kvm, gfn_start, gfn_end); + if (kvm_memslots_have_rmaps(kvm)) { - write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { @@ -5606,41 +5739,44 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush = slot_handle_level_range(kvm, memslot, + flush = slot_handle_level_range(kvm, + (const struct kvm_memory_slot *) memslot, kvm_zap_rmapp, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - write_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start); } if (is_tdp_mmu_enabled(kvm)) { - flush = false; - - read_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, - gfn_end, flush, true); + gfn_end, flush); if (flush) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end); - - read_unlock(&kvm->mmu_lock); + gfn_end - gfn_start); } + + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + + kvm_dec_notifier_count(kvm, gfn_start, gfn_end); + + write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level) { bool flush = false; @@ -5676,7 +5812,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, static 
bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -5699,7 +5835,7 @@ restart: if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, pfn, PG_LEVEL_NUM)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, @@ -5715,10 +5851,8 @@ restart: } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *slot) { - /* FIXME: const-ify all uses of struct kvm_memory_slot. */ - struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; bool flush = false; if (kvm_memslots_have_rmaps(kvm)) { @@ -5754,7 +5888,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { bool flush = false; diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index cedc17b2f60e..9e7dcf999f08 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -147,7 +147,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot); + rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!rmap_head->val) { if (!__ratelimit(&ratelimit_state)) return; @@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 35567293c1fd..bf2bdbf333c2 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -31,13 +31,16 @@ extern bool dbg; #define IS_VALID_PAE_ROOT(x) (!!(x)) struct kvm_mmu_page { + /* + * Note, "link" through "spt" fit in a single 64 byte cache line on + * 64-bit kernels, keep it that way unless there's a reason not to. + */ struct list_head link; struct hlist_node hash_link; - struct list_head lpage_disallowed_link; + bool tdp_mmu_page; bool unsync; u8 mmu_valid_gen; - bool mmio_cached; bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* @@ -59,6 +62,7 @@ struct kvm_mmu_page { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ DECLARE_BITMAP(unsync_child_bitmap, 512); + struct list_head lpage_disallowed_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an @@ -71,8 +75,6 @@ struct kvm_mmu_page { atomic_t write_flooding_count; #ifdef CONFIG_X86_64 - bool tdp_mmu_page; - /* Used for freeing the page asynchronously if it is a TDP MMU page. 
*/ struct rcu_head rcu_head; #endif @@ -124,13 +126,14 @@ static inline bool is_nx_huge_page_enabled(void) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); /* * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). @@ -140,6 +143,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + * + * Any names added to this enum should be exported to userspace for use in + * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h */ enum { RET_PF_RETRY = 0, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index efbad33a0645..2924a4081a19 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -54,6 +54,12 @@ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } +TRACE_DEFINE_ENUM(RET_PF_RETRY); +TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_FIXED); +TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); + /* * A pagetable walk has started */ diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 91a9f7e0fd91..269f11f92fd0 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -16,6 +16,7 @@ #include <asm/kvm_page_track.h> +#include "mmu.h" #include "mmu_internal.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ee044d357b5f..7d03e9b7ccfa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, - write_fault, &map_writable)) - return RET_PF_RETRY; + if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + write_fault, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) return r; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d80cb122b5f3..64ccfc1fa553 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -10,7 +10,7 @@ #include <asm/cmpxchg.h> #include <trace/events/kvm.h> -static bool __read_mostly tdp_mmu_enabled = false; +static bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. 
*/ @@ -255,26 +255,17 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * * @kvm: kvm instance * @sp: the new page - * @shared: This operation may not be running under the exclusive use of - * the MMU lock and the operation must synchronize with other - * threads that might be adding or removing pages. * @account_nx: This page replaces a NX large page and should be marked for * eventual reclaim. */ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared, bool account_nx) + bool account_nx) { - if (shared) - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - else - lockdep_assert_held_write(&kvm->mmu_lock); - + spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add(&sp->link, &kvm->arch.tdp_mmu_pages); if (account_nx) account_huge_nx_page(kvm, sp); - - if (shared) - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } /** @@ -445,13 +436,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); - if (is_large_pte(old_spte) != is_large_pte(new_spte)) { - if (is_large_pte(old_spte)) - atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); - else - atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); - } - /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -477,6 +461,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, return; } + if (is_leaf != was_leaf) + kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); if (was_leaf && is_dirty_spte(old_spte) && (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) @@ -526,6 +512,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, if (is_removed_spte(iter->old_spte)) return false; + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. + */ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; @@ -537,15 +527,40 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +/* + * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a + * TDP page fault. + * + * @vcpu: The vcpu instance that took the TDP page fault. + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, + struct tdp_iter *iter, + u64 new_spte) { + struct kvm *kvm = vcpu->kvm; + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) return false; - handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, - iter->old_spte, new_spte, iter->level); + /* + * Use kvm_vcpu_gfn_to_memslot() instead of going through + * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. + */ + if (is_writable_pte(new_spte)) { + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); + + if (slot && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. 
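Note: tdp_mmu_map_set_spte_atomic() above only performs dirty logging after tdp_mmu_set_spte_atomic_no_dirty_log() has won the cmpxchg64() race, so the side effect happens exactly once per installed SPTE. The pattern reduces to a small self-contained userspace sketch with C11 atomics; the "mark dirty" step here is just a stand-in for mark_page_dirty_in_slot().

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        static _Atomic uint64_t spte;   /* stand-in for the shadow page table entry */

        /* Install new_val only if the entry still holds expected; mirrors cmpxchg64(). */
        static bool set_spte_atomic(uint64_t expected, uint64_t new_val)
        {
                return atomic_compare_exchange_strong(&spte, &expected, new_val);
        }

        int main(void)
        {
                uint64_t old = atomic_load(&spte);
                uint64_t new_val = old | 0x1;   /* e.g. make the entry present/writable */

                if (!set_spte_atomic(old, new_val))
                        return 1;       /* lost the race: retry the fault, no side effects */

                /* Only the winner of the exchange logs the dirty page. */
                printf("CAS won, marking page dirty (stand-in for mark_page_dirty_in_slot)\n");
                return 0;
        }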
*/ + WARN_ON_ONCE(iter->level > PG_LEVEL_4K); + mark_page_dirty_in_slot(kvm, slot, iter->gfn); + } + } + return true; } @@ -558,7 +573,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -789,21 +804,15 @@ retry: * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. - * - * If shared is true, this thread holds the MMU lock in read mode and must - * account for the possibility that other threads are modifying the paging - * structures concurrently. If shared is false, this thread should hold the - * MMU in write mode. */ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared) + gfn_t end, bool can_yield, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared) + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false) flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, - shared); + false); return flush; } @@ -814,8 +823,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) int i; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, - flush, false); + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush); if (flush) kvm_flush_remote_tlbs(kvm); @@ -940,7 +948,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) return RET_PF_RETRY; /* @@ -1044,9 +1052,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, - new_spte)) { - tdp_mmu_link_page(vcpu->kvm, sp, true, + if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { + tdp_mmu_link_page(vcpu->kvm, sp, huge_page_disallowed && req_level >= iter.level); @@ -1255,8 +1262,8 @@ retry: * only affect leaf SPTEs down to min_level. * Returns true if an SPTE has been changed and the TLBs need to be flushed. */ -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level) +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1326,7 +1333,8 @@ retry: * each SPTE. Returns true if an SPTE has been changed and the TLBs need to * be flushed. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1529,6 +1537,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 
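Note: both kvm_tdp_mmu_get_walk() and the kvm_tdp_mmu_fast_pf_get_last_sptep() helper added below document the same contract: they must run between kvm_tdp_mmu_walk_lockless_begin() and kvm_tdp_mmu_walk_lockless_end(), and the returned pointer must not be used after the end call. A non-authoritative usage sketch of how a caller such as fast_page_fault() is expected to honor that contract (error handling and retry logic omitted):

        /* Usage sketch only; not the actual fast_page_fault() implementation. */
        u64 spte;
        u64 *sptep;

        kvm_tdp_mmu_walk_lockless_begin();              /* takes rcu_read_lock() */

        sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, addr, &spte);
        if (sptep) {
                /*
                 * sptep and spte are only stable inside this section; any write
                 * back must use an atomic update that tolerates concurrent zaps.
                 */
        }

        kvm_tdp_mmu_walk_lockless_end();                /* drops rcu_read_lock() */
        /* Do not dereference sptep past this point. */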
*/ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) @@ -1540,14 +1550,47 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; - rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } - rcu_read_unlock(); - return leaf; } + +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. + * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. + * + * WARNING: This function is only intended to be called during fast_page_fault. + */ +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + gfn_t gfn = addr >> PAGE_SHIFT; + tdp_ptep_t sptep = NULL; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + *spte = iter.old_spte; + sptep = iter.sptep; + } + + /* + * Perform the rcu_dereference to get the raw spte pointer value since + * we are passing it up to fast_page_fault, which is shared with the + * legacy MMU and thus does not retain the TDP MMU-specific __rcu + * annotation. + * + * This is safe since fast_page_fault obeys the contracts of this + * function as well as all TDP MMU contracts around modifying SPTEs + * outside of mmu_lock. + */ + return rcu_dereference(sptep); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1cae4485b3bc..358f447d4012 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -20,14 +20,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared); + gfn_t end, bool can_yield, bool flush); static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, - gfn_t start, gfn_t end, bool flush, - bool shared) + gfn_t start, gfn_t end, bool flush) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush, - shared); + return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -44,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) */ lockdep_assert_held_write(&kvm->mmu_lock); return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), - sp->gfn, end, false, false, false); + sp->gfn, end, false, false); } void kvm_tdp_mmu_zap_all(struct kvm *kvm); @@ -61,10 +58,10 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level); +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level); bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, - struct kvm_memory_slot *slot); + const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, @@ -77,8 +74,20 @@ bool 
kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); +static inline void kvm_tdp_mmu_walk_lockless_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_tdp_mmu_walk_lockless_end(void) +{ + rcu_read_unlock(); +} + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte); #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 827886c12c16..0772bad9165c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + pmc->is_paused = false; } static void pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter; - if (!pmc->perf_event) + if (!pmc->perf_event || pmc->is_paused) return; /* update counter, reset event value to avoid redundant accumulation */ counter += perf_event_pause(pmc->perf_event, true); pmc->counter = counter & pmc_bitmask(pmc); + pmc->is_paused = true; } static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); + pmc->is_paused = false; clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 67e753edfa22..0e4f2b1fa9fb 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) u64 counter, enabled, running; counter = pmc->counter; - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); /* FIXME: Scaling needed? */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a8ad78a2faa1..8052d92069e0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -197,6 +197,8 @@ void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; + if (kvm_apicv_activated(svm->vcpu.kvm)) vmcb->control.int_ctl |= AVIC_ENABLE_MASK; else @@ -225,31 +227,26 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, * field of the VMCB. Therefore, we set up the * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. */ -static int avic_update_access_page(struct kvm *kvm, bool activate) +static int avic_alloc_access_page(struct kvm *kvm) { void __user *ret; int r = 0; mutex_lock(&kvm->slots_lock); - /* - * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger - * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT - * memory region. So, we need to ensure that kvm->mm == current->mm. - */ - if ((kvm->arch.apic_access_memslot_enabled == activate) || - (kvm->mm != current->mm)) + + if (kvm->arch.apic_access_memslot_enabled) goto out; ret = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, - activate ? 
PAGE_SIZE : 0); + PAGE_SIZE); if (IS_ERR(ret)) { r = PTR_ERR(ret); goto out; } - kvm->arch.apic_access_memslot_enabled = activate; + kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); return r; @@ -270,7 +267,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (kvm_apicv_activated(vcpu->kvm)) { int ret; - ret = avic_update_access_page(vcpu->kvm, true); + ret = avic_alloc_access_page(vcpu->kvm); if (ret) return ret; } @@ -587,17 +584,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) -{ - if (!enable_apicv || !lapic_in_kernel(vcpu)) - return; - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_request_apicv_update(vcpu->kvm, activate, - APICV_INHIBIT_REASON_IRQWIN); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -} - void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { return; @@ -667,6 +653,11 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } vmcb_mark_dirty(vmcb, VMCB_AVIC); + if (activated) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); + svm_set_pi_irte_mode(vcpu, activated); } @@ -918,10 +909,6 @@ bool svm_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) -{ - avic_update_access_page(kvm, activate); -} static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) @@ -960,9 +947,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - /* * Since the host physical APIC id is 8 bits, * we can support host APIC ID upto 255. @@ -990,9 +974,6 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) avic_update_iommu_vcpu_affinity(vcpu, -1, 0); @@ -1009,6 +990,10 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) struct vcpu_svm *svm = to_svm(vcpu); svm->avic_is_running = is_run; + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + if (is_run) avic_vcpu_load(vcpu, vcpu->cpu); else diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e5515477c30a..2545d0c61985 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -666,11 +666,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) goto out; } - - /* Clear internal status */ - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); - /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. 
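Note: the pmu.c/pmu.h hunks above introduce pmc->is_paused so a paused counter is neither re-accumulated on a second pause nor double-counted on read: pmc_pause_counter() folds the perf delta into pmc->counter exactly once, and pmc_read_counter() only adds the live perf value while the counter is running. A minimal self-contained model of that accounting, with a plain variable standing in for the backing perf event:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        struct toy_pmc {
                uint64_t counter;       /* accumulated count */
                uint64_t live;          /* stand-in for the perf event's current value */
                bool     is_paused;
        };

        static void pmc_pause(struct toy_pmc *pmc)
        {
                if (pmc->is_paused)
                        return;                 /* already folded in, avoid double add */
                pmc->counter += pmc->live;      /* fold the live delta into the counter */
                pmc->live = 0;                  /* perf_event_pause(.., true) resets it */
                pmc->is_paused = true;
        }

        static uint64_t pmc_read(const struct toy_pmc *pmc)
        {
                /* only add the live value while the event is actually counting */
                return pmc->counter + (pmc->is_paused ? 0 : pmc->live);
        }

        int main(void)
        {
                struct toy_pmc pmc = { .live = 100 };

                pmc_pause(&pmc);
                pmc_pause(&pmc);        /* second pause must be a no-op */
                printf("%llu\n", (unsigned long long)pmc_read(&pmc));   /* prints 100, not 200 */
                return 0;
        }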
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7fbce342eec4..75e0b21ad07c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -28,8 +28,6 @@ #include "cpuid.h" #include "trace.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - #ifndef CONFIG_KVM_AMD_SEV /* * When this config is not defined, SEV feature is not supported and APIs in @@ -584,6 +582,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xcr0 = svm->vcpu.arch.xcr0; save->pkru = svm->vcpu.arch.pkru; save->xss = svm->vcpu.arch.ia32_xss; + save->dr6 = svm->vcpu.arch.dr6; /* * SEV-ES will use a VMSA that is pointed to by the VMCB, not diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 69639f9624f5..05e8d4d27969 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -46,8 +46,6 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -261,7 +259,7 @@ u32 svm_msrpm_offset(u32 msr) static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 - return PT64_ROOT_4LEVEL; + return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; #else return PT32E_ROOT_LEVEL; #endif @@ -462,11 +460,6 @@ static int has_svm(void) return 0; } - if (pgtable_l5_enabled()) { - pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); - return 0; - } - return 1; } @@ -1015,7 +1008,9 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); + /* Force VM NPT level equal to the host's max NPT level */ + kvm_configure_mmu(npt_enabled, get_max_npt_level(), + get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Note, SEV setup consumes npt_enabled. */ @@ -1161,8 +1156,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - vcpu->arch.hflags = 0; - svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); svm_set_intercept(svm, INTERCEPT_CR4_READ); @@ -1241,29 +1234,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu) SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; save->cs.limit = 0xffff; + save->gdtr.base = 0; save->gdtr.limit = 0xffff; + save->idtr.base = 0; save->idtr.limit = 0xffff; init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_cr4(vcpu, 0); - svm_set_efer(vcpu, 0); - save->dr6 = 0xffff0ff0; - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - save->rip = 0x0000fff0; - vcpu->arch.regs[VCPU_REGS_RIP] = save->rip; - - /* - * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. - * It also updates the guest-visible cr0 value. - */ - svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - kvm_mmu_reset_context(vcpu); - - save->cr4 = X86_CR4_PAE; - /* rdx = ?? 
*/ - if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; @@ -1273,14 +1251,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = vcpu->arch.pat; save->cr3 = 0; - save->cr4 = 0; } svm->current_vmcb->asid_generation = 0; svm->asid = 0; svm->nested.vmcb12_gpa = INVALID_GPA; svm->nested.last_vmcb12_gpa = INVALID_GPA; - vcpu->arch.hflags = 0; if (!kvm_pause_in_guest(vcpu->kvm)) { control->pause_filter_count = pause_filter_count; @@ -1330,25 +1306,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu) static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); - u32 dummy; - u32 eax = 1; svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - } init_vmcb(vcpu); - - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); - kvm_rdx_write(vcpu, eax); - - if (kvm_vcpu_apicv_active(vcpu) && !init_event) - avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@ -1513,12 +1475,15 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); } - avic_vcpu_load(vcpu, cpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_load(vcpu, cpu); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_put(vcpu); + svm_prepare_host_switch(vcpu); ++vcpu->stat.host_state_reload; @@ -1560,7 +1525,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); } } @@ -2078,11 +2043,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) return -EINVAL; /* - * VMCB is undefined after a SHUTDOWN intercept - * so reinitialize it. + * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put + * the VMCB in a known good state. Unfortuately, KVM doesn't have + * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking + * userspace. At a platform view, INIT is acceptable behavior as + * there exist bare metal platforms that automatically INIT the CPU + * in response to shutdown. */ clear_page(svm->vmcb); - init_vmcb(vcpu); + kvm_vcpu_reset(vcpu, true); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -2993,10 +2962,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->msr_decfg = data; break; } - case MSR_IA32_APICBASE: - if (kvm_vcpu_apicv_active(vcpu)) - avic_update_vapic_bar(to_svm(vcpu), data); - fallthrough; default: return kvm_set_msr_common(vcpu, msr); } @@ -3021,7 +2986,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. 
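Note: the get_max_npt_level() change above returns PT64_ROOT_5LEVEL when the host itself runs with 5-level paging. The address width follows from the radix-tree shape: each paging level decodes 9 bits of the address and the page offset contributes 12, so 4 levels cover 4*9 + 12 = 48 address bits and 5 levels cover 5*9 + 12 = 57. A one-line compile-time check of that arithmetic:

        /* 9 translation bits per level plus a 12-bit page offset. */
        _Static_assert(4 * 9 + 12 == 48, "4-level paging spans 48 bits");
        _Static_assert(5 * 9 + 12 == 57, "5-level paging spans 57 bits");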
*/ - svm_toggle_avic_for_irq_window(vcpu, true); + kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); ++vcpu->stat.irq_window_exits; return 1; @@ -3269,12 +3234,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) { - if (exit_code < ARRAY_SIZE(svm_exit_handlers) && - svm_exit_handlers[exit_code]) - return 0; + return (exit_code < ARRAY_SIZE(svm_exit_handlers) && + svm_exit_handlers[exit_code]); +} +static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +{ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); dump_vmcb(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -3282,14 +3249,13 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; - - return -EINVAL; + return 0; } int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (svm_handle_invalid_exit(vcpu, exit_code)) - return 0; + if (!svm_check_exit_valid(vcpu, exit_code)) + return svm_handle_invalid_exit(vcpu, exit_code); #ifdef CONFIG_RETPOLINE if (exit_code == SVM_EXIT_MSR) @@ -3573,7 +3539,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * via AVIC. In such case, we need to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. */ - svm_toggle_avic_for_irq_window(vcpu, false); + kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN); svm_set_vintr(svm); } } @@ -3808,6 +3774,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) pre_svm_run(vcpu); + WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + sync_lapic_to_cr8(vcpu); if (unlikely(svm->asid != svm->vmcb->control.asid)) { @@ -4610,7 +4578,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, - .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index bd0fe94c2920..524d943f3efc 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -503,12 +503,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) -{ - svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; - vmcb_mark_dirty(svm->vmcb, VMCB_AVIC); -} - static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -524,7 +518,6 @@ int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm); -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); @@ -534,7 +527,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu); void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void 
svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool svm_check_apicv_inhibit_reasons(ulong bit); -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate); void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 8170f2a5a16f..22e2b019de37 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -4,7 +4,7 @@ #include <linux/compiler_types.h> -#include <asm/kvm_host.h> +#include "x86.h" #define svm_asm(insn, clobber...) \ do { \ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 896b2a50b4aa..0dab1b7b529f 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -14,7 +14,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); #if IS_ENABLED(CONFIG_HYPERV) -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ {EVMCS1_OFFSET(name), clean_field} diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 2ec9b46f0d0c..152ab0aa82cf 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -73,8 +73,6 @@ struct evmcs_field { extern const struct evmcs_field vmcs_field_to_evmcs_1[]; extern const unsigned int nr_evmcs_1_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static __always_inline int get_evmcs_offset(unsigned long field, u16 *clean_field) { @@ -95,8 +93,6 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -#undef ROL16 - static inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b3f77d18eb5a..ccb03d69546c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2207,7 +2207,8 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, } } -static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, + struct vmcs12 *vmcs12) { u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); @@ -2218,23 +2219,22 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * PIN CONTROLS */ - exec_control = vmx_pin_based_exec_ctrl(vmx); + exec_control = __pin_controls_get(vmcs01); exec_control |= (vmcs12->pin_based_vm_exec_control & ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. 
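Note: the svm.c hunk above splits validity checking (svm_check_exit_valid()) from error reporting (svm_handle_invalid_exit()) so svm_invoke_exit_handler() and the retpoline fast paths can share the check. The underlying pattern is an ordinary bounds- and NULL-checked dispatch table; a self-contained sketch with made-up handlers and exit codes:

        #include <stdbool.h>
        #include <stddef.h>
        #include <stdio.h>

        typedef int (*exit_handler_t)(void);

        static int handle_hlt(void) { return 1; }
        static int handle_msr(void) { return 1; }

        static exit_handler_t handlers[] = {
                [0] = handle_hlt,
                [1] = handle_msr,
                /* gaps stay NULL */
        };

        #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

        static bool exit_valid(size_t code)
        {
                return code < ARRAY_SIZE(handlers) && handlers[code];
        }

        static int invoke_exit_handler(size_t code)
        {
                if (!exit_valid(code)) {
                        fprintf(stderr, "unexpected exit reason %zu\n", code);
                        return 0;       /* report instead of dereferencing a NULL entry */
                }
                return handlers[code]();
        }

        int main(void)
        {
                invoke_exit_handler(1);         /* dispatches to handle_msr() */
                invoke_exit_handler(5);         /* out of range: reported, not crashed */
                return 0;
        }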
*/ - if (nested_cpu_has_posted_intr(vmcs12)) { + vmx->nested.pi_pending = false; + if (nested_cpu_has_posted_intr(vmcs12)) vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - } else { + else exec_control &= ~PIN_BASED_POSTED_INTR; - } pin_controls_set(vmx, exec_control); /* * EXEC CONTROLS */ - exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control = __exec_controls_get(vmcs01); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; @@ -2271,10 +2271,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * SECONDARY EXEC CONTROLS */ if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmx->secondary_exec_control; + exec_control = __secondary_exec_controls_get(vmcs01); /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | @@ -2282,7 +2283,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_TSC_SCALING); + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_DESC); + if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -2322,8 +2325,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). */ - exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & - ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + exec_control = __vm_entry_controls_get(vmcs01); + exec_control |= vmcs12->vm_entry_controls; + exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; @@ -2339,9 +2343,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
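Note: prepare_vmcs02_early() above now seeds each control field from vmcs01's cached value via __pin_controls_get()/__exec_controls_get()/__secondary_exec_controls_get(); those double-underscore accessors are generated by the controls-shadow macro in vmx.h, extended later in this diff. The token-pasting idea in isolation, as a self-contained sketch with made-up field names:

        #include <assert.h>
        #include <stdint.h>

        struct shadow {
                uint32_t pin;
                uint32_t exec;
        };

        /* Generate __<name>_get(struct shadow *) and <name>_set() per field. */
        #define BUILD_SHADOW_ACCESSORS(name)                                    \
        static inline uint32_t __##name##_get(const struct shadow *s)          \
        {                                                                       \
                return s->name;                                                 \
        }                                                                       \
        static inline void name##_set(struct shadow *s, uint32_t val)          \
        {                                                                       \
                s->name = val;                                                  \
        }

        BUILD_SHADOW_ACCESSORS(pin)
        BUILD_SHADOW_ACCESSORS(exec)

        int main(void)
        {
                struct shadow s = { 0 };

                pin_set(&s, 0x16);
                assert(__pin_get(&s) == 0x16);
                return 0;
        }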
*/ - exec_control = vmx_vmexit_ctrl(); + exec_control = __vm_exit_controls_get(vmcs01); if (cpu_has_load_ia32_efer() && guest_efer != host_efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; + else + exec_control &= ~VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_set(vmx, exec_control); /* @@ -3384,7 +3390,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); - prepare_vmcs02_early(vmx, vmcs12); + prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); if (from_vmentry) { if (unlikely(!nested_get_vmcs12_pages(vcpu))) { @@ -4304,7 +4310,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, seg.l = 1; else seg.db = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, @@ -4315,17 +4321,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .g = 1 }; seg.selector = vmcs12->host_ds_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); seg.selector = vmcs12->host_es_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); seg.selector = vmcs12->host_ss_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); seg.selector = vmcs12->host_fs_selector; seg.base = vmcs12->host_fs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); seg.selector = vmcs12->host_gs_selector; seg.base = vmcs12->host_gs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); seg = (struct kvm_segment) { .base = vmcs12->host_tr_base, .limit = 0x67, @@ -4333,14 +4339,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .type = 11, .present = 1 }; - vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + memset(&seg, 0, sizeof(seg)); + seg.unusable = 1; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); @@ -4419,9 +4426,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - /* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9efc1a6b8693..10cc4f65c4ef 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 4b9957e2bf5b..6e5de2e2b0da 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ 
b/arch/x86/kvm/vmx/vmcs.h @@ -11,6 +11,8 @@ #include "capabilities.h" +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) + struct vmcs_hdr { u32 revision_id:31; u32 shadow_vmcs:1; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index d9f5d7c56ae3..cab6ba7a5005 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -2,7 +2,6 @@ #include "vmcs12.h" -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) #define FIELD64(number, name) \ diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 5e0e1b39f495..2a45f026ee11 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -364,8 +364,6 @@ static inline void vmx_check_vmcs12_offsets(void) extern const unsigned short vmcs_field_to_offset_table[]; extern const unsigned int nr_vmcs12_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static inline short vmcs_field_to_offset(unsigned long field) { unsigned short offset; @@ -385,8 +383,6 @@ static inline short vmcs_field_to_offset(unsigned long field) return offset; } -#undef ROL16 - static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u16 offset) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 927a552393b9..0c2c0d5ae873 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -136,8 +136,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ - X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -1648,11 +1647,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, } /* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. + * Configuring user return MSRs to automatically save, load, and restore MSRs + * that need to be shoved into hardware when running the guest. Note, omitting + * an MSR here does _NOT_ mean it's not emulated, only that it will not be + * loaded into hardware when running the guest. */ -static void setup_msrs(struct vcpu_vmx *vmx) +static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 bool load_syscall_msrs; @@ -1682,9 +1682,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) */ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(&vmx->vcpu); - /* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter. @@ -2263,8 +2260,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (is_unrestricted_guest(vcpu) || - (enable_ept && is_paging(vcpu))) + /* + * When intercepting CR3 loads, e.g. for shadowing paging, KVM's + * CR3 is loaded into hardware, not the guest's CR3. 
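Note: with the duplicate copies removed, ROL16() now lives only in vmcs.h; the VMCS field tables use ROL16(number, 6) to turn sparse field encodings into compact array indices. A worked value, checkable with the small self-contained program below: rotating 0x1234 (0001_0010_0011_0100) left by six bits wraps the top six bits to the bottom and yields 0x8D04.

        #include <assert.h>
        #include <stdint.h>

        #define ROL16(val, n) ((uint16_t)(((uint16_t)(val) << (n)) | ((uint16_t)(val) >> (16 - (n)))))

        int main(void)
        {
                assert(ROL16(0x1234, 6) == 0x8D04);     /* top 6 bits wrap to the bottom */
                return 0;
        }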
+ */ + if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: @@ -2274,7 +2274,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); break; } } @@ -2733,7 +2733,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } - vmx_set_segment(vcpu, save, seg); + __vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) @@ -2754,7 +2754,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 0; - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -2852,8 +2852,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - - kvm_mmu_reset_context(vcpu); } int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -2874,7 +2872,7 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } - setup_msrs(vmx); + vmx_setup_uret_msrs(vmx); return 0; } @@ -2997,42 +2995,24 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, - struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } - - if (!(cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; -} +#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING) void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; + unsigned long hw_cr0, old_cr0_pg; + u32 tmp; + + old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + if (!enable_ept) + hw_cr0 |= X86_CR0_WP; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -3041,22 +3021,60 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); } + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, hw_cr0); + vcpu->arch.cr0 = cr0; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) + if (!old_cr0_pg && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) + else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) exit_lmode(vcpu); } #endif - if (enable_ept && !is_unrestricted_guest(vcpu)) - ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if 
(enable_ept && !is_unrestricted_guest(vcpu)) { + /* + * Ensure KVM has an up-to-date snapshot of the guest's CR3. If + * the below code _enables_ CR3 exiting, vmx_cache_reg() will + * (correctly) stop reading vmcs.GUEST_CR3 because it thinks + * KVM's CR3 is installed. + */ + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); - vcpu->arch.cr0 = cr0; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + /* + * When running with EPT but not unrestricted guest, KVM must + * intercept CR3 accesses when paging is _disabled_. This is + * necessary because restricted guests can't actually run with + * paging disabled, and so KVM stuffs its own CR3 in order to + * run the guest when identity mapped page tables. + * + * Do _NOT_ check the old CR0.PG, e.g. to optimize away the + * update, it may be stale with respect to CR3 interception, + * e.g. after nested VM-Enter. + * + * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or + * stores to forward them to L1, even if KVM does not need to + * intercept them to preserve its identity mapped page tables. + */ + if (!(cr0 & X86_CR0_PG)) { + exec_controls_setbit(vmx, CR3_EXITING_BITS); + } else if (!is_guest_mode(vcpu)) { + exec_controls_clearbit(vmx, CR3_EXITING_BITS); + } else { + tmp = exec_controls_get(vmx); + tmp &= ~CR3_EXITING_BITS; + tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; + exec_controls_set(vmx, tmp); + } + + /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ + if ((old_cr0_pg ^ cr0) & X86_CR0_PG) + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); @@ -3271,7 +3289,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) return ar; } -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -3284,7 +3302,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) vmcs_write16(sf->selector, var->selector); else if (var->s) fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - goto out; + return; } vmcs_writel(sf->base, var->base); @@ -3306,9 +3324,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); +} -out: - vmx->emulation_required = emulation_required(vcpu); +static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + __vmx_set_segment(vcpu, var, seg); + + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3790,21 +3812,6 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) -{ - u8 mode = 0; - - if (cpu_has_secondary_exec_ctrls() && - (secondary_exec_controls_get(to_vmx(vcpu)) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - mode |= MSR_BITMAP_MODE_X2APIC; - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) - mode |= MSR_BITMAP_MODE_X2APIC_APICV; - } - - return mode; -} - static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { unsigned long *msr_bitmap = 
to_vmx(vcpu)->vmcs01.msr_bitmap; @@ -3822,11 +3829,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) } } -static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + u8 mode; + if (!cpu_has_vmx_msr_bitmap()) return; + if (cpu_has_secondary_exec_ctrls() && + (secondary_exec_controls_get(vmx) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode = MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } else { + mode = 0; + } + + if (mode == vmx->x2apic_msr_bitmap_mode) + return; + + vmx->x2apic_msr_bitmap_mode = mode; + vmx_reset_x2apic_msrs(vcpu, mode); /* @@ -3843,21 +3868,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) } } -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u8 mode = vmx_msr_bitmap_mode(vcpu); - u8 changed = mode ^ vmx->msr_bitmap_mode; - - if (!changed) - return; - - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(vcpu, mode); - - vmx->msr_bitmap_mode = mode; -} - void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3914,7 +3924,6 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) } pt_update_intercept_for_msr(vcpu); - vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4086,7 +4095,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; @@ -4102,6 +4111,30 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) return pin_based_exec_ctrl; } +static u32 vmx_vmentry_ctrl(void) +{ + u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; + + if (vmx_pt_mode_is_system()) + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmentry_ctrl & + ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); +} + +static u32 vmx_vmexit_ctrl(void) +{ + u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + + if (vmx_pt_mode_is_system()) + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmexit_ctrl & + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); +} + static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4118,11 +4151,10 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } -u32 vmx_exec_control(struct vcpu_vmx *vmx) +static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -4204,7 +4236,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) -static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) +static u32 
vmx_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; @@ -4290,7 +4322,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; - vmx->secondary_exec_control = exec_control; + return exec_control; } #define VMX_XSS_EXIT_BITMAP 0 @@ -4314,10 +4346,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) exec_controls_set(vmx, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - secondary_exec_controls_set(vmx, vmx->secondary_exec_control); - } + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -4388,32 +4418,35 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (cpu_need_tpr_shadow(&vmx->vcpu)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + __pa(vmx->vcpu.arch.apic->regs)); + vmcs_write32(TPR_THRESHOLD, 0); + } + + vmx_setup_uret_msrs(vmx); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct msr_data apic_base_msr; - u64 cr0; vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; vmx->msr_ia32_umwait_control = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - if (!init_event) { - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(vcpu, &apic_base_msr); - } - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); @@ -4436,16 +4469,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - } - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0xfff0); - vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4458,31 +4481,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); - setup_msrs(vmx); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); - } - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx->vcpu.arch.cr0 = cr0; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ - vmx_set_cr4(vcpu, 0); - vmx_set_efer(vcpu, 0); - - vmx_update_exception_bitmap(vcpu); - vpid_sync_context(vmx->vpid); - if (init_event) - vmx_clear_hlt(vcpu); } static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) @@ -4996,6 +4999,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, err); case 3: 
WARN_ON_ONCE(enable_unrestricted_guest); + err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: @@ -5021,14 +5025,13 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 2: /* clts */ - WARN_ONCE(1, "Guest should always own CR0.TS"); - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - return kvm_skip_emulated_instruction(vcpu); + KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); + return -EIO; case 1: /*mov from cr*/ switch (cr) { case 3: WARN_ON_ONCE(enable_unrestricted_guest); + val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); @@ -5129,6 +5132,12 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); + + /* + * exc_debug expects dr6 to be cleared after it runs, avoid that it sees + * a stale dr6 from the guest. + */ + set_debugreg(DR6_RESERVED, 6); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) @@ -5338,7 +5347,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { - WARN_ON_ONCE(!enable_vnmi); + if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) + return -EIO; + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5896,7 +5907,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * below) should never happen as that means we incorrectly allowed a * nested VM-Enter with an invalid vmcs12. */ - WARN_ON_ONCE(vmx->nested.nested_run_pending); + if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) + return -EIO; /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -6189,7 +6201,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) } secondary_exec_controls_set(vmx, sec_exec_control); - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) @@ -6274,7 +6286,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) int max_irr; bool max_irr_updated; - WARN_ON(!vcpu->arch.apicv_active); + if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + return -EIO; + if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* @@ -6357,7 +6371,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; gate_desc *desc = (gate_desc *)host_idt_base + vector; - if (WARN_ONCE(!is_external_intr(intr_info), + if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; @@ -6368,6 +6382,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vmx->emulation_required) + return; + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) @@ -6639,6 +6656,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + set_debugreg(vcpu->arch.dr6, 6); + /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. 
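Note: in the vmx_set_cr0() hunk above, the nested-guest case rebuilds the CR3 exiting bits by clearing CR3_EXITING_BITS from KVM's current controls and copying in whatever L1 requested in vmcs12. That merge step is the usual mask-and-combine idiom; a self-contained sketch, with the CPU_BASED_CR3_*_EXITING encodings shown as local defines for illustration:

        #include <assert.h>
        #include <stdint.h>

        #define CR3_LOAD_EXITING   0x8000u
        #define CR3_STORE_EXITING  0x10000u
        #define CR3_EXITING_BITS   (CR3_LOAD_EXITING | CR3_STORE_EXITING)

        /* Keep every bit L0 owns, take only the CR3 intercept bits from L1. */
        static uint32_t merge_cr3_exiting(uint32_t l0_controls, uint32_t l1_controls)
        {
                uint32_t tmp = l0_controls & ~CR3_EXITING_BITS;

                return tmp | (l1_controls & CR3_EXITING_BITS);
        }

        int main(void)
        {
                /* L1 wants CR3 load exits; L0's unrelated bits are preserved. */
                assert(merge_cr3_exiting(0x04000002u, CR3_LOAD_EXITING) == 0x04008002u);
                return 0;
        }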
Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -6838,7 +6859,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } - vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); @@ -6997,7 +7017,7 @@ exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } -static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) +static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) { /* * These bits in the secondary execution controls field @@ -7011,7 +7031,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC; - u32 new_ctl = vmx->secondary_exec_control; u32 cur_ctl = secondary_exec_controls_get(vmx); secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); @@ -7154,10 +7173,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ vcpu->arch.xsaves_enabled = false; - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - vmcs_set_secondary_exec_control(vmx); - } + vmx_setup_uret_msrs(vmx); + + if (cpu_has_secondary_exec_ctrls()) + vmcs_set_secondary_exec_control(vmx, + vmx_secondary_exec_control(vmx)); if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= @@ -7803,7 +7823,8 @@ static __init int hardware_setup(void) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; - kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); + kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), + ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 17a1cb4b059d..4858c5fd95f2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -227,7 +227,7 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; u8 fail; - u8 msr_bitmap_mode; + u8 x2apic_msr_bitmap_mode; /* * If true, host state has been stored in vmx->loaded_vmcs for @@ -263,8 +263,6 @@ struct vcpu_vmx { u64 spec_ctrl; u32 msr_ia32_umwait_control; - u32 secondary_exec_control; - /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. 
For a nested @@ -371,12 +369,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); @@ -419,9 +416,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ vmx->loaded_vmcs->controls_shadow.lname = val; \ } \ } \ +static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ { \ - return vmx->loaded_vmcs->controls_shadow.lname; \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ } \ static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ { \ @@ -451,31 +452,6 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = 0; } -static inline u32 vmx_vmentry_ctrl(void) -{ - u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; - if (vmx_pt_mode_is_system()) - vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | - VM_ENTRY_LOAD_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmentry_ctrl & - ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); -} - -static inline u32 vmx_vmexit_ctrl(void) -{ - u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; - if (vmx_pt_mode_is_system()) - vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | - VM_EXIT_CLEAR_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmexit_ctrl & - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); -} - -u32 vmx_exec_control(struct vcpu_vmx *vmx); -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx); - static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 164b64f65a8f..9e9ef47e988c 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -4,13 +4,11 @@ #include <linux/nospec.h> -#include <asm/kvm_host.h> #include <asm/vmx.h> #include "evmcs.h" #include "vmcs.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) +#include "x86.h" asmlinkage void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5d5c5ed7dd4..28ef14155726 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -233,12 +233,13 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, mmu_recycled), STATS_DESC_COUNTER(VM, mmu_cache_miss), STATS_DESC_ICOUNTER(VM, mmu_unsync), - STATS_DESC_ICOUNTER(VM, lpages), + STATS_DESC_ICOUNTER(VM, pages_4k), + STATS_DESC_ICOUNTER(VM, pages_2m), + STATS_DESC_ICOUNTER(VM, pages_1g), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; 
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -278,8 +279,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_ICOUNTER(VCPU, guest_mode) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -485,7 +484,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible noinstr void kvm_spurious_fault(void) +/* + * Handle a fault on a hardware virtualization (VMX or SVM) instruction. + * + * Hardware virtualization extension instructions may fault if a reboot turns + * off virtualization while processes are running. Usually after catching the + * fault we just panic; during reboot instead the instruction is ignored. + */ +noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); @@ -1180,7 +1186,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } } @@ -3316,6 +3321,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } @@ -4310,12 +4319,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); - /* - * If userspace has set any breakpoints or watchpoints, dr6 is restored - * on every vmexit, but if not, we might have a stale dr6 from the - * guest. do_debug expects dr6 to be cleared after it runs, do the same. - */ - set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, @@ -6567,9 +6570,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, * there is no pkey in EPT page table for L1 guest or EPT * shadow page table for L2 guest. */ - if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.mmio_access, 0, access)) { + if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || + !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.mmio_access, 0, access))) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -8578,6 +8581,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); static void kvm_apicv_init(struct kvm *kvm) { + mutex_init(&kvm->arch.apicv_update_lock); + if (enable_apicv) clear_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); @@ -8891,6 +8896,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) can_inject = false; } + /* Don't inject interrupts if the user asked to avoid doing so */ + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) + return 0; + /* * Finally, inject interrupt events. If an event cannot be injected * due to architectural conditions (e.g. 
IF=0) a window-open exit @@ -9236,10 +9245,18 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + bool activate; + if (!lapic_in_kernel(vcpu)) return; - vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + activate = kvm_apicv_activated(vcpu->kvm); + if (vcpu->arch.apicv_active == activate) + goto out; + + vcpu->arch.apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9251,54 +9268,45 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) */ if (!vcpu->arch.apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); + +out: + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -/* - * NOTE: Do not hold any lock prior to calling this. - * - * In particular, kvm_request_apicv_update() expects kvm->srcu not to be - * locked, because it calls __x86_set_memory_region() which does - * synchronize_srcu(&kvm->srcu). - */ -void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - struct kvm_vcpu *except; - unsigned long old, new, expected; + unsigned long old, new; if (!kvm_x86_ops.check_apicv_inhibit_reasons || !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; - old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); - do { - expected = new = old; - if (activate) - __clear_bit(bit, &new); - else - __set_bit(bit, &new); - if (new == old) - break; - old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); - } while (old != expected); - - if (!!old == !!new) - return; + old = new = kvm->arch.apicv_inhibit_reasons; - trace_kvm_apicv_update_request(activate, bit); - if (kvm_x86_ops.pre_update_apicv_exec_ctrl) - static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); + if (activate) + __clear_bit(bit, &new); + else + __set_bit(bit, &new); + + if (!!old != !!new) { + trace_kvm_apicv_update_request(activate, bit); + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + kvm->arch.apicv_inhibit_reasons = new; + if (new) { + unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); + kvm_zap_gfn_range(kvm, gfn, gfn+1); + } + } else + kvm->arch.apicv_inhibit_reasons = new; +} +EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); - /* - * Sending request to update APICV for all other vcpus, - * while update the calling vcpu immediately instead of - * waiting for another #VMEXIT to handle the request. 
- */ - except = kvm_get_running_vcpu(); - kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, - except); - if (except) - kvm_vcpu_update_apicv(except); +void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +{ + mutex_lock(&kvm->arch.apicv_update_lock); + __kvm_request_apicv_update(kvm, activate, bit); + mutex_unlock(&kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -9395,6 +9403,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + r = -EIO; + goto out; + } if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; @@ -9608,8 +9620,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); - set_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } @@ -9639,7 +9649,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* @@ -9976,7 +9985,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || + (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { r = -EINVAL; goto out; } @@ -10581,9 +10591,6 @@ static void store_regs(struct kvm_vcpu *vcpu) static int sync_regs(struct kvm_vcpu *vcpu) { - if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) - return -EINVAL; - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; @@ -10799,6 +10806,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long new_cr0; + u32 eax, dummy; kvm_lapic_reset(vcpu, init_event); @@ -10865,10 +10874,41 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + /* + * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) + * if no CPUID match is found. Note, it's impossible to get a match at + * RESET since KVM emulates RESET before exposing the vCPU to userspace, + * i.e. it's impossible for kvm_cpuid() to find a valid entry on RESET. + * But, go through the motions in case that's ever remedied. + */ + eax = 1; + if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) + eax = 0x600; + kvm_rdx_write(vcpu, eax); + vcpu->arch.ia32_xss = 0; static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + /* + * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions + * of Intel's SDM list CD/NW as being set on INIT, but they contradict + * (or qualify) that with a footnote stating that CD/NW are preserved. 
+ */ + new_cr0 = X86_CR0_ET; + if (init_event) + new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD)); + else + new_cr0 |= X86_CR0_NW | X86_CR0_CD; + + static_call(kvm_x86_set_cr0)(vcpu, new_cr0); + static_call(kvm_x86_set_cr4)(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); + static_call(kvm_x86_update_exception_bitmap)(vcpu); + /* * Reset the MMU context if paging was enabled prior to INIT (which is * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the @@ -10879,7 +10919,20 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (old_cr0 & X86_CR0_PG) kvm_mmu_reset_context(vcpu); + + /* + * Intel's SDM states that all TLB entries are flushed on INIT. AMD's + * APM states the TLBs are untouched by INIT, but it also states that + * the TLBs are flushed on "External initialization of the processor." + * Flush the guest TLB regardless of vendor, there is no meaningful + * benefit in relying on the guest to flush the TLB immediately after + * INIT. A spurious TLB flush is benign and likely negligible from a + * performance perspective. + */ + if (init_event) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } +EXPORT_SYMBOL_GPL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { @@ -11123,6 +11176,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); + kvm_xen_init_vm(kvm); return static_call(kvm_x86_vm_init)(kvm); } @@ -11312,8 +11366,7 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { int level = i + 1; - int lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + int lpages = __kvm_mmu_slot_lpages(slot, npages, level); WARN_ON(slot->arch.rmap[i]); @@ -11396,8 +11449,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, int lpages; int level = i + 1; - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + lpages = __kvm_mmu_slot_lpages(slot, npages, level); linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) @@ -11481,7 +11533,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, - struct kvm_memory_slot *new, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; @@ -11561,10 +11613,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, kvm_mmu_calculate_default_mmu_pages(kvm)); - /* - * FIXME: const-ify all uses of struct kvm_memory_slot. - */ - kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); + kvm_mmu_slot_apply_flags(kvm, old, new, change); /* Free the arrays associated with the old memslot. 
*/ if (change == KVM_MR_MOVE) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 44ae10312740..7d66d63dc55a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,8 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +void kvm_spurious_fault(void); + static __always_inline void kvm_guest_enter_irqoff(void) { /* diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index ae17250e1efe..9ea9c3dabe37 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -25,15 +25,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { gpa_t gpa = gfn_to_gpa(gfn); int wc_ofs, sec_hi_ofs; - int ret; + int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache, - gpa, PAGE_SIZE); - if (ret) + if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) { + ret = -EFAULT; goto out; - - kvm->arch.xen.shinfo_set = true; + } + kvm->arch.xen.shinfo_gfn = gfn; /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -245,7 +244,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) case KVM_XEN_ATTR_TYPE_SHARED_INFO: if (data->u.shared_info.gfn == GPA_INVALID) { - kvm->arch.xen.shinfo_set = false; + kvm->arch.xen.shinfo_gfn = GPA_INVALID; r = 0; break; } @@ -283,10 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_set) - data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); - else - data->u.shared_info.gfn = GPA_INVALID; + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); r = 0; break; @@ -646,6 +642,11 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) return 0; } +void kvm_xen_init_vm(struct kvm *kvm) +{ + kvm->arch.xen.shinfo_gfn = GPA_INVALID; +} + void kvm_xen_destroy_vm(struct kvm *kvm) { if (kvm->arch.xen_hvm_config.msr) diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 463a7844a8ca..cc0cf5f37450 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -21,6 +21,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); +void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); static inline bool kvm_xen_msr_enabled(struct kvm *kvm) @@ -50,6 +51,10 @@ static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) return 1; } +static inline void kvm_xen_init_vm(struct kvm *kvm) +{ +} + static inline void kvm_xen_destroy_vm(struct kvm *kvm) { } diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 058f19b20465..c565def611e2 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -37,10 +37,10 @@ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ - ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) + ({ t r; memcpy(&r, insn->next_byte, sizeof(t)); insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ - ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); }) + ({ t r; memcpy(&r, (insn)->next_byte + n, sizeof(t)); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) diff --git a/arch/x86/mm/fault.c 
b/arch/x86/mm/fault.c index b2eefdefc108..84a2c8c4af73 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -710,7 +710,8 @@ oops: static noinline void kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) + unsigned long address, int signal, int si_code, + u32 pkey) { WARN_ON_ONCE(user_mode(regs)); @@ -735,8 +736,12 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, set_signal_archinfo(address, error_code); - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address); + if (si_code == SEGV_PKUERR) { + force_sig_pkuerr((void __user *)address, pkey); + } else { + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_fault(signal, si_code, (void __user *)address); + } } /* @@ -798,7 +803,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; if (!user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, si_code, pkey); return; } @@ -930,7 +936,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } @@ -1396,7 +1403,8 @@ good_area: */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, - SIGBUS, BUS_ADRERR); + SIGBUS, BUS_ADRERR, + ARCH_DEFAULT_PKEY); return; } @@ -1416,7 +1424,8 @@ good_area: return; if (fatal_signal_pending(current) && !user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, 0, 0); + kernelmode_fixup_or_oops(regs, error_code, address, + 0, 0, ARCH_DEFAULT_PKEY); return; } @@ -1424,7 +1433,8 @@ good_area: /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); + SIGSEGV, SEGV_MAPERR, + ARCH_DEFAULT_PKEY); return; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 75ef19aa8903..23a14d82e783 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -127,14 +127,12 @@ __ref void *alloc_low_pages(unsigned int num) unsigned long ret = 0; if (min_pfn_mapped < max_pfn_mapped) { - ret = memblock_find_in_range( + ret = memblock_phys_alloc_range( + PAGE_SIZE * num, PAGE_SIZE, min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE * num , PAGE_SIZE); + max_pfn_mapped << PAGE_SHIFT); } - if (ret) - memblock_reserve(ret, PAGE_SIZE * num); - else if (can_use_brk_pgt) + if (!ret && can_use_brk_pgt) ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); if (!ret) @@ -610,8 +608,17 @@ static void __init memory_map_top_down(unsigned long map_start, unsigned long addr; unsigned long mapped_ram_size = 0; - /* xen has big range in reserved near end of ram, skip it at first.*/ - addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); + /* + * Systems that have many reserved areas near top of the memory, + * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will + * require lots of 4K mappings which may exhaust pgt_buf. + * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure + * there is enough mapped memory that can be allocated from + * memblock. 
+ */ + addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, + map_end); + memblock_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 74b78840182d..bd90b8fe81e4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -801,8 +801,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start_pfn, nr_pages, params); } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ddeaba947eb3..36098226a957 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1255,8 +1255,7 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -1433,18 +1432,18 @@ int kern_addr_valid(unsigned long addr) return 0; p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) + if (!p4d_present(*p4d)) return 0; pud = pud_offset(p4d, addr); - if (pud_none(*pud)) + if (!pud_present(*pud)) return 0; if (pud_large(*pud)) return pfn_valid(pud_pfn(*pud)); pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return 0; if (pmd_large(*pmd)) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 1a50434c8a4d..ef885370719a 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -49,8 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; - else if (p) - memblock_free(__pa(p), PMD_SIZE); + memblock_free_ptr(p, PMD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); @@ -86,8 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; - else if (p) - memblock_free(__pa(p), PUD_SIZE); + memblock_free_ptr(p, PUD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e94da744386f..1e9b93b088db 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -355,7 +355,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_free(__pa(numa_distance), size); + memblock_free_ptr(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } @@ -376,15 +376,14 @@ static int __init numa_alloc_distance(void) cnt++; size = cnt * cnt * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - size, PAGE_SIZE); + phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; } - memblock_reserve(phys, size); numa_distance = __va(phys); numa_distance_cnt = cnt; diff --git a/arch/x86/mm/numa_emulation.c 
b/arch/x86/mm/numa_emulation.c index 87d77cc52f86..e801e30089c4 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -447,13 +447,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (numa_dist_cnt) { u64 phys; - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - phys_size, PAGE_SIZE); + phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } - memblock_reserve(phys, phys_size); phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) @@ -518,8 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } /* free the copied physical distance table */ - if (phys_dist) - memblock_free(__pa(phys_dist), phys_size); + memblock_free_ptr(phys_dist, phys_size); return; no_emu: diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 3112ca7786ed..4ba2a3ee4bce 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -583,7 +583,12 @@ int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type, int err = 0; start = sanitize_phys(start); - end = sanitize_phys(end); + + /* + * The end address passed into this function is exclusive, but + * sanitize_phys() expects an inclusive address. + */ + end = sanitize_phys(end - 1) + 1; if (start >= end) { WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, start, end - 1, cattr_name(req_type)); diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c index 01a085d9135a..4f0147d4e225 100644 --- a/arch/x86/pci/numachip.c +++ b/arch/x86/pci/numachip.c @@ -12,6 +12,7 @@ #include <linux/pci.h> #include <asm/pci_x86.h> +#include <asm/numachip/numachip.h> static u8 limit __read_mostly; diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 7d2525691854..101081ad64b6 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -146,8 +146,7 @@ static void sta2x11_map_ep(struct pci_dev *pdev) dev_err(dev, "sta2x11: could not set DMA offset\n"); dev->bus_dma_limit = max_amba_addr; - pci_set_consistent_dma_mask(pdev, max_amba_addr); - pci_set_dma_mask(pdev, max_amba_addr); + dma_set_mask_and_coherent(&pdev->dev, max_amba_addr); /* Configure AHB mapping */ pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0); diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 6534c92d0f83..31b5856010cb 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -28,7 +28,7 @@ void __init reserve_real_mode(void) WARN_ON(slab_is_available()); /* Has to be under 1M so we can execute real-mode AP code. 
*/ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20); if (!mem) pr_info("No sub-1M memory is available for the trampoline\n"); else diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index b95db9daf0e8..4c6c2be0c899 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -101,4 +101,16 @@ static inline void remap_stack_and_trap(void) "memory"); } +static __always_inline void *get_stub_page(void) +{ + unsigned long ret; + + asm volatile ( + "movl %%esp,%0 ;" + "andl %1,%0" + : "=a" (ret) + : "g" (~(UM_KERN_PAGE_SIZE - 1))); + + return (void *)ret; +} #endif diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 6e2626b77a2e..e9c4b2b38803 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -108,4 +108,16 @@ static inline void remap_stack_and_trap(void) __syscall_clobber, "r10", "r8", "r9"); } +static __always_inline void *get_stub_page(void) +{ + unsigned long ret; + + asm volatile ( + "movq %%rsp,%0 ;" + "andq %1,%0" + : "=a" (ret) + : "g" (~(UM_KERN_PAGE_SIZE - 1))); + + return (void *)ret; +} #endif diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c index 21836eaf1725..f7eefba034f9 100644 --- a/arch/x86/um/stub_segv.c +++ b/arch/x86/um/stub_segv.c @@ -11,9 +11,8 @@ void __attribute__ ((__section__ (".__syscall_stub"))) stub_segv_handler(int sig, siginfo_t *info, void *p) { - int stack; + struct faultinfo *f = get_stub_page(); ucontext_t *uc = p; - struct faultinfo *f = (void *)(((unsigned long)&stack) & ~(UM_KERN_PAGE_SIZE - 1)); GET_FAULTINFO_FROM_MC(*f, &uc->uc_mcontext); trap_myself(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 753f63734c13..6e0d0754f94f 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -755,8 +755,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) preempt_enable(); } -static void xen_convert_trap_info(const struct desc_ptr *desc, - struct trap_info *traps) +static unsigned xen_convert_trap_info(const struct desc_ptr *desc, + struct trap_info *traps, bool full) { unsigned in, out, count; @@ -766,17 +766,18 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, for (in = out = 0; in < count; in++) { gate_desc *entry = (gate_desc *)(desc->address) + in; - if (cvt_gate_to_trap(in, entry, &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out]) || full) out++; } - traps[out].address = 0; + + return out; } void xen_copy_trap_info(struct trap_info *traps) { const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); - xen_convert_trap_info(desc, traps); + xen_convert_trap_info(desc, traps, true); } /* Load a new IDT into Xen. 
In principle this can be per-CPU, so we @@ -786,6 +787,7 @@ static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + unsigned out; trace_xen_cpu_load_idt(desc); @@ -793,7 +795,8 @@ static void xen_load_idt(const struct desc_ptr *desc) memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); - xen_convert_trap_info(desc, traps); + out = xen_convert_trap_info(desc, traps, false); + memset(&traps[out], 0, sizeof(traps[0])); xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) @@ -1214,6 +1217,11 @@ static void __init xen_dom0_set_legacy_features(void) x86_platform.legacy.rtc = 1; } +static void __init xen_domu_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 0; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1359,6 +1367,8 @@ asmlinkage __visible void __init xen_start_kernel(void) add_preferred_console("xenboot", 0, NULL); if (pci_xen) x86_init.pci.arch_init = pci_xen_init; + x86_platform.set_legacy_features = + xen_domu_set_legacy_features; } else { const struct dom0_vga_console_info *info = (void *)((char *)xen_start_info + diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1df5f01529e5..8d751939c6f3 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1518,14 +1518,17 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, if (pinned) { struct page *page = pfn_to_page(pfn); - if (static_branch_likely(&xen_struct_pages_ready)) + pinned = false; + if (static_branch_likely(&xen_struct_pages_ready)) { + pinned = PagePinned(page); SetPagePinned(page); + } xen_mc_batch(); __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); xen_mc_issue(PARAVIRT_LAZY_MMU); diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 54f9aa7e8457..46df59aeaa06 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -18,7 +18,7 @@ #endif #include <linux/export.h> -int xen_swiotlb __read_mostly; +static int xen_swiotlb __read_mostly; /* * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary @@ -56,7 +56,7 @@ int __init pci_xen_swiotlb_detect(void) return xen_swiotlb; } -void __init pci_xen_swiotlb_init(void) +static void __init pci_xen_swiotlb_init(void) { if (xen_swiotlb) { xen_swiotlb_init_early(); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 96afadf9878e..7ed56c6075b0 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -290,8 +290,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_rw(cpu); - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - /* * Bring up the CPU in cpu_bringup_and_idle() with the stack * pointing just below where pt_regs would be if it were a normal @@ -308,8 +306,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) xen_copy_trap_info(ctxt->trap_ctxt); - ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt & ~PAGE_MASK); gdt_mfn = arbitrary_virt_to_mfn(gdt); diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 9bf2a9bc8539..0e56bad058fa 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -42,6 +42,7 @@ config XTENSA select MODULES_USE_ELF_RELA select PERF_USE_VMALLOC select SET_FS + select TRACE_IRQFLAGS_SUPPORT select VIRT_TO_BUS help Xtensa processors are 32-bit RISC machines designed by Tensilica @@ 
-73,9 +74,6 @@ config LOCKDEP_SUPPORT config STACKTRACE_SUPPORT def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config MMU def_bool n diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index b3d1bc8a9095..104b327f8ac9 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -417,3 +417,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease |