diff options
Diffstat (limited to 'arch')
561 files changed, 16358 insertions, 5627 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 31c4fdc4a4ba..763b1b5e4f41 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -24,6 +24,13 @@ config KEXEC_ELF config HAVE_IMA_KEXEC bool +config ARCH_HAS_SUBPAGE_FAULTS + bool + help + Select if the architecture can check permissions at sub-page + granularity (e.g. arm64 MTE). The probe_user_*() functions + must be implemented. + config HOTPLUG_SMT bool @@ -35,6 +42,7 @@ config KPROBES depends on MODULES depends on HAVE_KPROBES select KALLSYMS + select TASKS_RCU if PREEMPTION help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes @@ -46,6 +54,7 @@ config JUMP_LABEL bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL depends on CC_HAS_ASM_GOTO + select OBJTOOL if HAVE_JUMP_LABEL_HACK help This option enables a transparent branch optimization that makes certain almost-always-true or almost-always-false branch @@ -723,10 +732,7 @@ config ARCH_SUPPORTS_CFI_CLANG config CFI_CLANG bool "Use Clang's Control Flow Integrity (CFI)" depends on LTO_CLANG && ARCH_SUPPORTS_CFI_CLANG - # Clang >= 12: - # - https://bugs.llvm.org/show_bug.cgi?id=46258 - # - https://bugs.llvm.org/show_bug.cgi?id=47479 - depends on CLANG_VERSION >= 120000 + depends on CLANG_VERSION >= 140000 select KALLSYMS help This option enables Clang’s forward-edge Control Flow Integrity @@ -1026,11 +1032,23 @@ config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT depends on MMU select ARCH_HAS_ELF_RANDOMIZE +config HAVE_OBJTOOL + bool + +config HAVE_JUMP_LABEL_HACK + bool + +config HAVE_NOINSTR_HACK + bool + +config HAVE_NOINSTR_VALIDATION + bool + config HAVE_STACK_VALIDATION bool help - Architecture supports the 'objtool check' host tool command, which - performs compile-time stack metadata validation. + Architecture supports objtool compile-time frame pointer rule + validation. config HAVE_RELIABLE_STACKTRACE bool @@ -1300,6 +1318,7 @@ config HAVE_STATIC_CALL config HAVE_STATIC_CALL_INLINE bool depends on HAVE_STATIC_CALL + select OBJTOOL config HAVE_PREEMPT_DYNAMIC bool diff --git a/arch/alpha/include/asm/timex.h b/arch/alpha/include/asm/timex.h index b565cc6f408e..f89798da8a14 100644 --- a/arch/alpha/include/asm/timex.h +++ b/arch/alpha/include/asm/timex.h @@ -28,5 +28,6 @@ static inline cycles_t get_cycles (void) __asm__ __volatile__ ("rpcc %0" : "=r"(ret)); return ret; } +#define get_cycles get_cycles #endif diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 7d81535893af..739891b94136 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -135,6 +135,8 @@ #define SO_TXREHASH 74 +#define SO_RCVMARK 75 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2e8091e2d8a8..0dcf88e7f9cf 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -972,6 +972,17 @@ config ARM_ERRATA_764369 relevant cache maintenance functions and sets a specific bit in the diagnostic control register of the SCU. +config ARM_ERRATA_764319 + bool "ARM errata: Read to DBGPRSR and DBGOSLSR may generate Undefined instruction" + depends on CPU_V7 + help + This option enables the workaround for the 764319 Cortex A-9 erratum. + CP14 read accesses to the DBGPRSR and DBGOSLSR registers generate an + unexpected Undefined Instruction exception when the DBGSWENABLE + external pin is set to 0, even when the CP14 accesses are performed + from a privileged mode. This work around catches the exception in a + way the kernel does not stop execution. + config ARM_ERRATA_775420 bool "ARM errata: A data cache maintenance operation which aborts, might lead to deadlock" depends on CPU_V7 diff --git a/arch/arm/boot/dts/aspeed-bmc-asrock-romed8hm3.dts b/arch/arm/boot/dts/aspeed-bmc-asrock-romed8hm3.dts index e71ccfd1df63..ff4c07c69af1 100644 --- a/arch/arm/boot/dts/aspeed-bmc-asrock-romed8hm3.dts +++ b/arch/arm/boot/dts/aspeed-bmc-asrock-romed8hm3.dts @@ -100,12 +100,14 @@ lm25066@40 { compatible = "lm25066"; reg = <0x40>; + shunt-resistor-micro-ohms = <1000>; }; /* 12VSB PMIC */ lm25066@41 { compatible = "lm25066"; reg = <0x41>; + shunt-resistor-micro-ohms = <10000>; }; }; @@ -196,7 +198,7 @@ gpio-line-names = /* A */ "LOCATORLED_STATUS_N", "BMC_MAC2_INTB", "NMI_BTN_N", "BMC_NMI", "", "", "", "", - /* B */ "DDR_MEM_TEMP", "", "", "", "", "", "", "", + /* B */ "POST_COMPLETE_N", "", "", "", "", "", "", "", /* C */ "", "", "", "", "PCIE_HP_SEL_N", "PCIE_SATA_SEL_N", "LOCATORBTN", "", /* D */ "BMC_PSIN", "BMC_PSOUT", "BMC_RESETCON", "RESETCON", "", "", "", "PSU_FAN_FAIL_N", diff --git a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi index e4775bbceecc..7cd4f075e325 100644 --- a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi +++ b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi @@ -117,9 +117,9 @@ groups = "FWSPID"; }; - pinctrl_fwqspid_default: fwqspid_default { - function = "FWSPID"; - groups = "FWQSPID"; + pinctrl_fwqspi_default: fwqspi_default { + function = "FWQSPI"; + groups = "FWQSPI"; }; pinctrl_fwspiwp_default: fwspiwp_default { @@ -653,12 +653,12 @@ }; pinctrl_qspi1_default: qspi1_default { - function = "QSPI1"; + function = "SPI1"; groups = "QSPI1"; }; pinctrl_qspi2_default: qspi2_default { - function = "QSPI2"; + function = "SPI2"; groups = "QSPI2"; }; diff --git a/arch/arm/boot/dts/aspeed-g6.dtsi b/arch/arm/boot/dts/aspeed-g6.dtsi index 3d5ce9da42c3..3c1011678ce6 100644 --- a/arch/arm/boot/dts/aspeed-g6.dtsi +++ b/arch/arm/boot/dts/aspeed-g6.dtsi @@ -181,6 +181,7 @@ status = "disabled"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_mdio1_default>; + resets = <&syscon ASPEED_RESET_MII>; }; mdio1: mdio@1e650008 { @@ -191,6 +192,7 @@ status = "disabled"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_mdio2_default>; + resets = <&syscon ASPEED_RESET_MII>; }; mdio2: mdio@1e650010 { @@ -201,6 +203,7 @@ status = "disabled"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_mdio3_default>; + resets = <&syscon ASPEED_RESET_MII>; }; mdio3: mdio@1e650018 { @@ -211,6 +214,7 @@ status = "disabled"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_mdio4_default>; + resets = <&syscon ASPEED_RESET_MII>; }; mac0: ftgmac@1e660000 { @@ -389,6 +393,16 @@ reg = <0x1e6f2000 0x1000>; }; + video: video@1e700000 { + compatible = "aspeed,ast2600-video-engine"; + reg = <0x1e700000 0x1000>; + clocks = <&syscon ASPEED_CLK_GATE_VCLK>, + <&syscon ASPEED_CLK_GATE_ECLK>; + clock-names = "vclk", "eclk"; + interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + gpio0: gpio@1e780000 { #gpio-cells = <2>; gpio-controller; diff --git a/arch/arm/boot/dts/imx6qdl-sr-som.dtsi b/arch/arm/boot/dts/imx6qdl-sr-som.dtsi index f86efd0ccc40..ce543e325cd3 100644 --- a/arch/arm/boot/dts/imx6qdl-sr-som.dtsi +++ b/arch/arm/boot/dts/imx6qdl-sr-som.dtsi @@ -83,6 +83,16 @@ qca,clk-out-frequency = <125000000>; qca,smarteee-tw-us-1g = <24>; }; + + /* + * ADIN1300 (som rev 1.9 or later) is always at address 1. It + * will be enabled automatically by U-Boot if detected. + */ + ethernet-phy@1 { + reg = <1>; + adi,phy-output-clock = "125mhz-free-running"; + status = "disabled"; + }; }; }; diff --git a/arch/arm/configs/lpc18xx_defconfig b/arch/arm/configs/lpc18xx_defconfig index be882ea0eee4..688c9849eec8 100644 --- a/arch/arm/configs/lpc18xx_defconfig +++ b/arch/arm/configs/lpc18xx_defconfig @@ -30,7 +30,6 @@ CONFIG_ARM_APPENDED_DTB=y # CONFIG_BLK_DEV_BSG is not set CONFIG_BINFMT_FLAT=y CONFIG_BINFMT_ZFLAT=y -CONFIG_BINFMT_SHARED_FLAT=y # CONFIG_COREDUMP is not set CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig index 89f4a6ff30bd..c1e98e33a348 100644 --- a/arch/arm/configs/mps2_defconfig +++ b/arch/arm/configs/mps2_defconfig @@ -23,7 +23,6 @@ CONFIG_PREEMPT_VOLUNTARY=y CONFIG_ZBOOT_ROM_TEXT=0x0 CONFIG_ZBOOT_ROM_BSS=0x0 CONFIG_BINFMT_FLAT=y -CONFIG_BINFMT_SHARED_FLAT=y # CONFIG_COREDUMP is not set # CONFIG_SUSPEND is not set CONFIG_NET=y diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig index 551db328009d..71d6bfcf4551 100644 --- a/arch/arm/configs/stm32_defconfig +++ b/arch/arm/configs/stm32_defconfig @@ -28,7 +28,6 @@ CONFIG_ZBOOT_ROM_BSS=0x0 CONFIG_XIP_KERNEL=y CONFIG_XIP_PHYS_ADDR=0x08008000 CONFIG_BINFMT_FLAT=y -CONFIG_BINFMT_SHARED_FLAT=y # CONFIG_COREDUMP is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y diff --git a/arch/arm/configs/vf610m4_defconfig b/arch/arm/configs/vf610m4_defconfig index a89f035c3b01..70fdbfd83484 100644 --- a/arch/arm/configs/vf610m4_defconfig +++ b/arch/arm/configs/vf610m4_defconfig @@ -18,7 +18,6 @@ CONFIG_XIP_KERNEL=y CONFIG_XIP_PHYS_ADDR=0x0f000080 CONFIG_BINFMT_FLAT=y CONFIG_BINFMT_ZFLAT=y -CONFIG_BINFMT_SHARED_FLAT=y # CONFIG_SUSPEND is not set # CONFIG_UEVENT_HELPER is not set # CONFIG_STANDALONE is not set diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 413abfb42989..f82a819eb0db 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -48,6 +48,7 @@ static inline u32 read_ ## a64(void) \ return read_sysreg(a32); \ } \ +CPUIF_MAP(ICC_EOIR1, ICC_EOIR1_EL1) CPUIF_MAP(ICC_PMR, ICC_PMR_EL1) CPUIF_MAP(ICC_AP0R0, ICC_AP0R0_EL1) CPUIF_MAP(ICC_AP0R1, ICC_AP0R1_EL1) @@ -63,12 +64,6 @@ CPUIF_MAP(ICC_AP1R3, ICC_AP1R3_EL1) /* Low-level accessors */ -static inline void gic_write_eoir(u32 irq) -{ - write_sysreg(irq, ICC_EOIR1); - isb(); -} - static inline void gic_write_dir(u32 val) { write_sysreg(val, ICC_DIR); diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 34fe8d2dd5d1..90fbe4a3f9c8 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -666,12 +666,11 @@ THUMB( orr \reg , \reg , #PSR_T_BIT ) __adldst_l str, \src, \sym, \tmp, \cond .endm - .macro __ldst_va, op, reg, tmp, sym, cond + .macro __ldst_va, op, reg, tmp, sym, cond, offset #if __LINUX_ARM_ARCH__ >= 7 || \ !defined(CONFIG_ARM_HAS_GROUP_RELOCS) || \ (defined(MODULE) && defined(CONFIG_ARM_MODULE_PLTS)) mov_l \tmp, \sym, \cond - \op\cond \reg, [\tmp] #else /* * Avoid a literal load, by emitting a sequence of ADD/LDR instructions @@ -683,24 +682,29 @@ THUMB( orr \reg , \reg , #PSR_T_BIT ) .reloc .L0_\@, R_ARM_ALU_PC_G0_NC, \sym .reloc .L1_\@, R_ARM_ALU_PC_G1_NC, \sym .reloc .L2_\@, R_ARM_LDR_PC_G2, \sym -.L0_\@: sub\cond \tmp, pc, #8 -.L1_\@: sub\cond \tmp, \tmp, #4 -.L2_\@: \op\cond \reg, [\tmp, #0] +.L0_\@: sub\cond \tmp, pc, #8 - \offset +.L1_\@: sub\cond \tmp, \tmp, #4 - \offset +.L2_\@: #endif + \op\cond \reg, [\tmp, #\offset] .endm /* * ldr_va - load a 32-bit word from the virtual address of \sym */ - .macro ldr_va, rd:req, sym:req, cond - __ldst_va ldr, \rd, \rd, \sym, \cond + .macro ldr_va, rd:req, sym:req, cond, tmp, offset=0 + .ifnb \tmp + __ldst_va ldr, \rd, \tmp, \sym, \cond, \offset + .else + __ldst_va ldr, \rd, \rd, \sym, \cond, \offset + .endif .endm /* * str_va - store a 32-bit word to the virtual address of \sym */ .macro str_va, rn:req, sym:req, tmp:req, cond - __ldst_va str, \rn, \tmp, \sym, \cond + __ldst_va str, \rn, \tmp, \sym, \cond, 0 .endm /* @@ -727,9 +731,11 @@ THUMB( orr \reg , \reg , #PSR_T_BIT ) * are permitted to overlap with 'rd' if != sp */ .macro ldr_this_cpu, rd:req, sym:req, t1:req, t2:req -#if __LINUX_ARM_ARCH__ >= 7 || \ - !defined(CONFIG_ARM_HAS_GROUP_RELOCS) || \ - (defined(MODULE) && defined(CONFIG_ARM_MODULE_PLTS)) +#ifndef CONFIG_SMP + ldr_va \rd, \sym, tmp=\t1 +#elif __LINUX_ARM_ARCH__ >= 7 || \ + !defined(CONFIG_ARM_HAS_GROUP_RELOCS) || \ + (defined(MODULE) && defined(CONFIG_ARM_MODULE_PLTS)) this_cpu_offset \t1 mov_l \t2, \sym ldr \rd, [\t1, \t2] diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 0c70eb688a00..2a0739a2350b 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -440,6 +440,9 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr); #define ARCH_HAS_VALID_PHYS_ADDR_RANGE extern int valid_phys_addr_range(phys_addr_t addr, size_t size); extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); +extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap #endif /* diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h index cfffae67c04e..5546c9751478 100644 --- a/arch/arm/include/asm/module.h +++ b/arch/arm/include/asm/module.h @@ -3,20 +3,10 @@ #define _ASM_ARM_MODULE_H #include <asm-generic/module.h> - -struct unwind_table; +#include <asm/unwind.h> #ifdef CONFIG_ARM_UNWIND -enum { - ARM_SEC_INIT, - ARM_SEC_DEVINIT, - ARM_SEC_CORE, - ARM_SEC_EXIT, - ARM_SEC_DEVEXIT, - ARM_SEC_HOT, - ARM_SEC_UNLIKELY, - ARM_SEC_MAX, -}; +#define ELF_SECTION_UNWIND 0x70000001 #endif #define PLT_ENT_STRIDE L1_CACHE_BYTES @@ -36,7 +26,8 @@ struct mod_plt_sec { struct mod_arch_specific { #ifdef CONFIG_ARM_UNWIND - struct unwind_table *unwind[ARM_SEC_MAX]; + struct list_head unwind_list; + struct unwind_table *init_table; #endif #ifdef CONFIG_ARM_MODULE_PLTS struct mod_plt_sec core; diff --git a/arch/arm/include/asm/timex.h b/arch/arm/include/asm/timex.h index 7c3b3671d6c2..6d1337c169cd 100644 --- a/arch/arm/include/asm/timex.h +++ b/arch/arm/include/asm/timex.h @@ -11,5 +11,6 @@ typedef unsigned long cycles_t; #define get_cycles() ({ cycles_t c; read_current_timer(&c) ? 0 : c; }) +#define random_get_entropy() (((unsigned long)get_cycles()) ?: random_get_entropy_fallback()) #endif diff --git a/arch/arm/include/asm/unwind.h b/arch/arm/include/asm/unwind.h index 0f8a3439902d..b51f85417f58 100644 --- a/arch/arm/include/asm/unwind.h +++ b/arch/arm/include/asm/unwind.h @@ -24,6 +24,7 @@ struct unwind_idx { struct unwind_table { struct list_head list; + struct list_head mod_list; const struct unwind_idx *start; const struct unwind_idx *origin; const struct unwind_idx *stop; diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 06508698abb8..c39303e5c234 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -61,9 +61,8 @@ .macro pabt_helper @ PABORT handler takes pt_regs in r2, fault address in r4 and psr in r5 #ifdef MULTI_PABORT - ldr ip, .LCprocfns - mov lr, pc - ldr pc, [ip, #PROCESSOR_PABT_FUNC] + ldr_va ip, processor, offset=PROCESSOR_PABT_FUNC + bl_r ip #else bl CPU_PABORT_HANDLER #endif @@ -82,9 +81,8 @@ @ the fault status register in r1. r9 must be preserved. @ #ifdef MULTI_DABORT - ldr ip, .LCprocfns - mov lr, pc - ldr pc, [ip, #PROCESSOR_DABT_FUNC] + ldr_va ip, processor, offset=PROCESSOR_DABT_FUNC + bl_r ip #else bl CPU_DABORT_HANDLER #endif @@ -302,16 +300,6 @@ __fiq_svc: UNWIND(.fnend ) ENDPROC(__fiq_svc) - .align 5 -.LCcralign: - .word cr_alignment -#ifdef MULTI_DABORT -.LCprocfns: - .word processor -#endif -.LCfp: - .word fp_enter - /* * Abort mode handlers */ @@ -370,7 +358,7 @@ ENDPROC(__fiq_abt) THUMB( stmia sp, {r0 - r12} ) ATRAP( mrc p15, 0, r7, c1, c0, 0) - ATRAP( ldr r8, .LCcralign) + ATRAP( ldr_va r8, cr_alignment) ldmia r0, {r3 - r5} add r0, sp, #S_PC @ here for interlock avoidance @@ -379,8 +367,6 @@ ENDPROC(__fiq_abt) str r3, [sp] @ save the "real" r0 copied @ from the exception stack - ATRAP( ldr r8, [r8, #0]) - @ @ We are now ready to fill in the remaining blanks on the stack: @ @@ -505,9 +491,7 @@ __und_usr_thumb: */ #if __LINUX_ARM_ARCH__ < 7 /* If the target CPU may not be Thumb-2-capable, a run-time check is needed: */ -#define NEED_CPU_ARCHITECTURE - ldr r5, .LCcpu_architecture - ldr r5, [r5] + ldr_va r5, cpu_architecture cmp r5, #CPU_ARCH_ARMv7 blo __und_usr_fault_16 @ 16bit undefined instruction /* @@ -654,12 +638,6 @@ call_fpe: ret.w lr @ CP#14 (Debug) ret.w lr @ CP#15 (Control) -#ifdef NEED_CPU_ARCHITECTURE - .align 2 -.LCcpu_architecture: - .word __cpu_architecture -#endif - #ifdef CONFIG_NEON .align 6 @@ -685,9 +663,8 @@ call_fpe: #endif do_fpe: - ldr r4, .LCfp add r10, r10, #TI_FPSTATE @ r10 = workspace - ldr pc, [r4] @ Call FP module USR entry point + ldr_va pc, fp_enter, tmp=r4 @ Call FP module USR entry point /* * The FP module is called with these registers set: @@ -1101,6 +1078,12 @@ __kuser_helper_end: */ .macro vector_stub, name, mode, correction=0 .align 5 +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +vector_bhb_bpiall_\name: + mcr p15, 0, r0, c7, c5, 6 @ BPIALL + @ isb not needed due to "movs pc, lr" in the vector stub + @ which gives a "context synchronisation". +#endif vector_\name: .if \correction @@ -1111,7 +1094,8 @@ vector_\name: stmia sp, {r0, lr} @ save r0, lr @ Save spsr_<exception> (parent CPSR) -2: mrs lr, spsr +.Lvec_\name: + mrs lr, spsr str lr, [sp, #8] @ save spsr @ @@ -1145,28 +1129,14 @@ vector_bhb_loop8_\name: @ bhb workaround mov r0, #8 -3: b . + 4 +3: W(b) . + 4 subs r0, r0, #1 bne 3b - dsb - isb - b 2b -ENDPROC(vector_bhb_loop8_\name) - -vector_bhb_bpiall_\name: - .if \correction - sub lr, lr, #\correction - .endif - - @ Save r0, lr_<exception> (parent PC) - stmia sp, {r0, lr} - - @ bhb workaround - mcr p15, 0, r0, c7, c5, 6 @ BPIALL + dsb nsh @ isb not needed due to "movs pc, lr" in the vector stub @ which gives a "context synchronisation". - b 2b -ENDPROC(vector_bhb_bpiall_\name) + b .Lvec_\name +ENDPROC(vector_bhb_loop8_\name) .previous #endif @@ -1176,10 +1146,15 @@ ENDPROC(vector_bhb_bpiall_\name) .endm .section .stubs, "ax", %progbits - @ This must be the first word + @ These need to remain at the start of the section so that + @ they are in range of the 'SWI' entries in the vector tables + @ located 4k down. +.L__vector_swi: .word vector_swi #ifdef CONFIG_HARDEN_BRANCH_HISTORY +.L__vector_bhb_loop8_swi: .word vector_bhb_loop8_swi +.L__vector_bhb_bpiall_swi: .word vector_bhb_bpiall_swi #endif @@ -1322,10 +1297,11 @@ vector_addrexcptn: .globl vector_fiq .section .vectors, "ax", %progbits -.L__vectors_start: W(b) vector_rst W(b) vector_und - W(ldr) pc, .L__vectors_start + 0x1000 +ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_swi ) +THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_swi ) + W(ldr) pc, . W(b) vector_pabt W(b) vector_dabt W(b) vector_addrexcptn @@ -1334,10 +1310,11 @@ vector_addrexcptn: #ifdef CONFIG_HARDEN_BRANCH_HISTORY .section .vectors.bhb.loop8, "ax", %progbits -.L__vectors_bhb_loop8_start: W(b) vector_rst W(b) vector_bhb_loop8_und - W(ldr) pc, .L__vectors_bhb_loop8_start + 0x1004 +ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_loop8_swi ) +THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_loop8_swi ) + W(ldr) pc, . W(b) vector_bhb_loop8_pabt W(b) vector_bhb_loop8_dabt W(b) vector_addrexcptn @@ -1345,10 +1322,11 @@ vector_addrexcptn: W(b) vector_bhb_loop8_fiq .section .vectors.bhb.bpiall, "ax", %progbits -.L__vectors_bhb_bpiall_start: W(b) vector_rst W(b) vector_bhb_bpiall_und - W(ldr) pc, .L__vectors_bhb_bpiall_start + 0x1008 +ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_bpiall_swi ) +THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_bpiall_swi ) + W(ldr) pc, . W(b) vector_bhb_bpiall_pabt W(b) vector_bhb_bpiall_dabt W(b) vector_addrexcptn diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 90d40f4d56cf..7aa3ded4af92 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -164,7 +164,7 @@ ENTRY(vector_bhb_loop8_swi) 1: b 2f 2: subs r8, r8, #1 bne 1b - dsb + dsb nsh isb b 3f ENDPROC(vector_bhb_loop8_swi) @@ -198,7 +198,7 @@ ENTRY(vector_swi) #endif reload_current r10, ip zero_fp - alignment_trap r10, ip, __cr_alignment + alignment_trap r10, ip, cr_alignment asm_trace_hardirqs_on save=0 enable_irq_notrace ct_user_exit save=0 @@ -328,14 +328,6 @@ __sys_trace_return: bl syscall_trace_exit b ret_slow_syscall - .align 5 -#ifdef CONFIG_ALIGNMENT_TRAP - .type __cr_alignment, #object -__cr_alignment: - .word cr_alignment -#endif - .ltorg - .macro syscall_table_start, sym .equ __sys_nr, 0 .type \sym, #object diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index 9a1dc142f782..5865621bf691 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -48,8 +48,7 @@ .macro alignment_trap, rtmp1, rtmp2, label #ifdef CONFIG_ALIGNMENT_TRAP mrc p15, 0, \rtmp2, c1, c0, 0 - ldr \rtmp1, \label - ldr \rtmp1, [\rtmp1] + ldr_va \rtmp1, \label teq \rtmp1, \rtmp2 mcrne p15, 0, \rtmp1, c1, c0, 0 #endif diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index b1423fb130ea..054e9199f30d 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -941,6 +941,23 @@ static int hw_breakpoint_pending(unsigned long addr, unsigned int fsr, return ret; } +#ifdef CONFIG_ARM_ERRATA_764319 +static int oslsr_fault; + +static int debug_oslsr_trap(struct pt_regs *regs, unsigned int instr) +{ + oslsr_fault = 1; + instruction_pointer(regs) += 4; + return 0; +} + +static struct undef_hook debug_oslsr_hook = { + .instr_mask = 0xffffffff, + .instr_val = 0xee115e91, + .fn = debug_oslsr_trap, +}; +#endif + /* * One-time initialisation. */ @@ -974,7 +991,16 @@ static bool core_has_os_save_restore(void) case ARM_DEBUG_ARCH_V7_1: return true; case ARM_DEBUG_ARCH_V7_ECP14: +#ifdef CONFIG_ARM_ERRATA_764319 + oslsr_fault = 0; + register_undef_hook(&debug_oslsr_hook); ARM_DBG_READ(c1, c1, 4, oslsr); + unregister_undef_hook(&debug_oslsr_hook); + if (oslsr_fault) + return false; +#else + ARM_DBG_READ(c1, c1, 4, oslsr); +#endif if (oslsr & ARM_OSLSR_OSLM0) return true; fallthrough; diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 549abcedf795..d59c36dc0494 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -459,46 +459,40 @@ int module_finalize(const Elf32_Ehdr *hdr, const Elf_Shdr *sechdrs, #ifdef CONFIG_ARM_UNWIND const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; const Elf_Shdr *sechdrs_end = sechdrs + hdr->e_shnum; - struct mod_unwind_map maps[ARM_SEC_MAX]; - int i; + struct list_head *unwind_list = &mod->arch.unwind_list; - memset(maps, 0, sizeof(maps)); + INIT_LIST_HEAD(unwind_list); + mod->arch.init_table = NULL; for (s = sechdrs; s < sechdrs_end; s++) { const char *secname = secstrs + s->sh_name; + const char *txtname; + const Elf_Shdr *txt_sec; - if (!(s->sh_flags & SHF_ALLOC)) + if (!(s->sh_flags & SHF_ALLOC) || + s->sh_type != ELF_SECTION_UNWIND) continue; - if (strcmp(".ARM.exidx.init.text", secname) == 0) - maps[ARM_SEC_INIT].unw_sec = s; - else if (strcmp(".ARM.exidx", secname) == 0) - maps[ARM_SEC_CORE].unw_sec = s; - else if (strcmp(".ARM.exidx.exit.text", secname) == 0) - maps[ARM_SEC_EXIT].unw_sec = s; - else if (strcmp(".ARM.exidx.text.unlikely", secname) == 0) - maps[ARM_SEC_UNLIKELY].unw_sec = s; - else if (strcmp(".ARM.exidx.text.hot", secname) == 0) - maps[ARM_SEC_HOT].unw_sec = s; - else if (strcmp(".init.text", secname) == 0) - maps[ARM_SEC_INIT].txt_sec = s; - else if (strcmp(".text", secname) == 0) - maps[ARM_SEC_CORE].txt_sec = s; - else if (strcmp(".exit.text", secname) == 0) - maps[ARM_SEC_EXIT].txt_sec = s; - else if (strcmp(".text.unlikely", secname) == 0) - maps[ARM_SEC_UNLIKELY].txt_sec = s; - else if (strcmp(".text.hot", secname) == 0) - maps[ARM_SEC_HOT].txt_sec = s; - } + if (!strcmp(".ARM.exidx", secname)) + txtname = ".text"; + else + txtname = secname + strlen(".ARM.exidx"); + txt_sec = find_mod_section(hdr, sechdrs, txtname); + + if (txt_sec) { + struct unwind_table *table = + unwind_table_add(s->sh_addr, + s->sh_size, + txt_sec->sh_addr, + txt_sec->sh_size); - for (i = 0; i < ARM_SEC_MAX; i++) - if (maps[i].unw_sec && maps[i].txt_sec) - mod->arch.unwind[i] = - unwind_table_add(maps[i].unw_sec->sh_addr, - maps[i].unw_sec->sh_size, - maps[i].txt_sec->sh_addr, - maps[i].txt_sec->sh_size); + list_add(&table->mod_list, unwind_list); + + /* save init table for module_arch_freeing_init */ + if (strcmp(".ARM.exidx.init.text", secname) == 0) + mod->arch.init_table = table; + } + } #endif #ifdef CONFIG_ARM_PATCH_PHYS_VIRT s = find_mod_section(hdr, sechdrs, ".pv_table"); @@ -519,19 +513,27 @@ void module_arch_cleanup(struct module *mod) { #ifdef CONFIG_ARM_UNWIND - int i; + struct unwind_table *tmp; + struct unwind_table *n; - for (i = 0; i < ARM_SEC_MAX; i++) { - unwind_table_del(mod->arch.unwind[i]); - mod->arch.unwind[i] = NULL; + list_for_each_entry_safe(tmp, n, + &mod->arch.unwind_list, mod_list) { + list_del(&tmp->mod_list); + unwind_table_del(tmp); } + mod->arch.init_table = NULL; #endif } void __weak module_arch_freeing_init(struct module *mod) { #ifdef CONFIG_ARM_UNWIND - unwind_table_del(mod->arch.unwind[ARM_SEC_INIT]); - mod->arch.unwind[ARM_SEC_INIT] = NULL; + struct unwind_table *init = mod->arch.init_table; + + if (init) { + mod->arch.init_table = NULL; + list_del(&init->mod_list); + unwind_table_del(init); + } #endif } diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 459abc5d1819..ea128e32e8ca 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -708,6 +708,7 @@ static_assert(offsetof(siginfo_t, si_upper) == 0x18); static_assert(offsetof(siginfo_t, si_pkey) == 0x14); static_assert(offsetof(siginfo_t, si_perf_data) == 0x10); static_assert(offsetof(siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x18); static_assert(offsetof(siginfo_t, si_band) == 0x0c); static_assert(offsetof(siginfo_t, si_fd) == 0x10); static_assert(offsetof(siginfo_t, si_call_addr) == 0x0c); diff --git a/arch/arm/mach-sunxi/Kconfig b/arch/arm/mach-sunxi/Kconfig index e5c2fce281cd..abdb99fe1e97 100644 --- a/arch/arm/mach-sunxi/Kconfig +++ b/arch/arm/mach-sunxi/Kconfig @@ -4,10 +4,7 @@ menuconfig ARCH_SUNXI depends on ARCH_MULTI_V5 || ARCH_MULTI_V7 select ARCH_HAS_RESET_CONTROLLER select CLKSRC_MMIO - select GENERIC_IRQ_CHIP select GPIOLIB - select IRQ_DOMAIN_HIERARCHY - select IRQ_FASTEOI_HIERARCHY_HANDLERS select PINCTRL select PM_OPP select SUN4I_TIMER @@ -22,10 +19,12 @@ if ARCH_MULTI_V7 config MACH_SUN4I bool "Allwinner A10 (sun4i) SoCs support" default ARCH_SUNXI + select SUN4I_INTC config MACH_SUN5I bool "Allwinner A10s / A13 (sun5i) SoCs support" default ARCH_SUNXI + select SUN4I_INTC select SUN5I_HSTIMER config MACH_SUN6I @@ -34,6 +33,8 @@ config MACH_SUN6I select ARM_GIC select MFD_SUN6I_PRCM select SUN5I_HSTIMER + select SUN6I_R_INTC + select SUNXI_NMI_INTC config MACH_SUN7I bool "Allwinner A20 (sun7i) SoCs support" @@ -43,17 +44,21 @@ config MACH_SUN7I select ARCH_SUPPORTS_BIG_ENDIAN select HAVE_ARM_ARCH_TIMER select SUN5I_HSTIMER + select SUNXI_NMI_INTC config MACH_SUN8I bool "Allwinner sun8i Family SoCs support" default ARCH_SUNXI select ARM_GIC select MFD_SUN6I_PRCM + select SUN6I_R_INTC + select SUNXI_NMI_INTC config MACH_SUN9I bool "Allwinner (sun9i) SoCs support" default ARCH_SUNXI select ARM_GIC + select SUNXI_NMI_INTC config ARCH_SUNXI_MC_SMP bool @@ -69,6 +74,7 @@ if ARCH_MULTI_V5 config MACH_SUNIV bool "Allwinner ARMv5 F-series (suniv) SoCs support" default ARCH_SUNXI + select SUN4I_INTC help Support for Allwinner suniv ARMv5 SoCs. (F1C100A, F1C100s, F1C200s, F1C500, F1C600) diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index aa08bcb72db9..290702328a33 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -493,3 +493,11 @@ void __init early_ioremap_init(void) { early_ioremap_setup(); } + +bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + unsigned long pfn = PHYS_PFN(offset); + + return memblock_is_map_memory(pfn); +} diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index 06dbfb968182..fb9f3eb6bf48 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -288,6 +288,7 @@ void cpu_v7_ca15_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(0))) cpu_v7_spectre_v2_init(); + cpu_v7_spectre_bhb_init(); } void cpu_v7_bugs_init(void) diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index ec52b776f926..8ca1c9f262a2 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -28,7 +28,7 @@ CPPFLAGS_vdso.lds += -P -C -U$(ARCH) CFLAGS_REMOVE_vdso.o = -pg # Force -O2 to avoid libgcc dependencies -CFLAGS_REMOVE_vgettimeofday.o = -pg -Os $(GCC_PLUGINS_CFLAGS) +CFLAGS_REMOVE_vgettimeofday.o = -pg -Os $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) ifeq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o = -O2 else diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 20ea89d9ac2f..d550f5acfaf3 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -262,31 +262,31 @@ config ARM64_CONT_PMD_SHIFT default 4 config ARCH_MMAP_RND_BITS_MIN - default 14 if ARM64_64K_PAGES - default 16 if ARM64_16K_PAGES - default 18 + default 14 if ARM64_64K_PAGES + default 16 if ARM64_16K_PAGES + default 18 # max bits determined by the following formula: # VA_BITS - PAGE_SHIFT - 3 config ARCH_MMAP_RND_BITS_MAX - default 19 if ARM64_VA_BITS=36 - default 24 if ARM64_VA_BITS=39 - default 27 if ARM64_VA_BITS=42 - default 30 if ARM64_VA_BITS=47 - default 29 if ARM64_VA_BITS=48 && ARM64_64K_PAGES - default 31 if ARM64_VA_BITS=48 && ARM64_16K_PAGES - default 33 if ARM64_VA_BITS=48 - default 14 if ARM64_64K_PAGES - default 16 if ARM64_16K_PAGES - default 18 + default 19 if ARM64_VA_BITS=36 + default 24 if ARM64_VA_BITS=39 + default 27 if ARM64_VA_BITS=42 + default 30 if ARM64_VA_BITS=47 + default 29 if ARM64_VA_BITS=48 && ARM64_64K_PAGES + default 31 if ARM64_VA_BITS=48 && ARM64_16K_PAGES + default 33 if ARM64_VA_BITS=48 + default 14 if ARM64_64K_PAGES + default 16 if ARM64_16K_PAGES + default 18 config ARCH_MMAP_RND_COMPAT_BITS_MIN - default 7 if ARM64_64K_PAGES - default 9 if ARM64_16K_PAGES - default 11 + default 7 if ARM64_64K_PAGES + default 9 if ARM64_16K_PAGES + default 11 config ARCH_MMAP_RND_COMPAT_BITS_MAX - default 16 + default 16 config NO_IOPORT_MAP def_bool y if !PCI @@ -313,7 +313,7 @@ config GENERIC_HWEIGHT def_bool y config GENERIC_CSUM - def_bool y + def_bool y config GENERIC_CALIBRATE_DELAY def_bool y @@ -1046,8 +1046,7 @@ config SOCIONEXT_SYNQUACER_PREITS If unsure, say Y. -endmenu - +endmenu # "ARM errata workarounds via the alternatives framework" choice prompt "Page size" @@ -1575,9 +1574,9 @@ config SETEND_EMULATION be unexpected results in the applications. If unsure, say Y -endif +endif # ARMV8_DEPRECATED -endif +endif # COMPAT menu "ARMv8.1 architectural features" @@ -1602,15 +1601,15 @@ config ARM64_PAN bool "Enable support for Privileged Access Never (PAN)" default y help - Privileged Access Never (PAN; part of the ARMv8.1 Extensions) - prevents the kernel or hypervisor from accessing user-space (EL0) - memory directly. + Privileged Access Never (PAN; part of the ARMv8.1 Extensions) + prevents the kernel or hypervisor from accessing user-space (EL0) + memory directly. - Choosing this option will cause any unprotected (not using - copy_to_user et al) memory access to fail with a permission fault. + Choosing this option will cause any unprotected (not using + copy_to_user et al) memory access to fail with a permission fault. - The feature is detected at runtime, and will remain as a 'nop' - instruction if the cpu does not implement the feature. + The feature is detected at runtime, and will remain as a 'nop' + instruction if the cpu does not implement the feature. config AS_HAS_LDAPR def_bool $(as-instr,.arch_extension rcpc) @@ -1638,15 +1637,15 @@ config ARM64_USE_LSE_ATOMICS built with binutils >= 2.25 in order for the new instructions to be used. -endmenu +endmenu # "ARMv8.1 architectural features" menu "ARMv8.2 architectural features" config AS_HAS_ARMV8_2 - def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a) + def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a) config AS_HAS_SHA3 - def_bool $(as-instr,.arch armv8.2-a+sha3) + def_bool $(as-instr,.arch armv8.2-a+sha3) config ARM64_PMEM bool "Enable support for persistent memory" @@ -1690,7 +1689,7 @@ config ARM64_CNP at runtime, and does not affect PEs that do not implement this feature. -endmenu +endmenu # "ARMv8.2 architectural features" menu "ARMv8.3 architectural features" @@ -1753,7 +1752,7 @@ config AS_HAS_PAC config AS_HAS_CFI_NEGATE_RA_STATE def_bool $(as-instr,.cfi_startproc\n.cfi_negate_ra_state\n.cfi_endproc\n) -endmenu +endmenu # "ARMv8.3 architectural features" menu "ARMv8.4 architectural features" @@ -1794,7 +1793,7 @@ config ARM64_TLB_RANGE The feature introduces new assembly instructions, and they were support when binutils >= 2.30. -endmenu +endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" @@ -1880,6 +1879,7 @@ config ARM64_MTE depends on AS_HAS_LSE_ATOMICS # Required for tag checking in the uaccess routines depends on ARM64_PAN + select ARCH_HAS_SUBPAGE_FAULTS select ARCH_USES_HIGH_VMA_FLAGS help Memory Tagging (part of the ARMv8.5 Extensions) provides @@ -1901,7 +1901,7 @@ config ARM64_MTE Documentation/arm64/memory-tagging-extension.rst. -endmenu +endmenu # "ARMv8.5 architectural features" menu "ARMv8.7 architectural features" @@ -1910,12 +1910,12 @@ config ARM64_EPAN default y depends on ARM64_PAN help - Enhanced Privileged Access Never (EPAN) allows Privileged - Access Never to be used with Execute-only mappings. + Enhanced Privileged Access Never (EPAN) allows Privileged + Access Never to be used with Execute-only mappings. - The feature is detected at runtime, and will remain disabled - if the cpu does not implement the feature. -endmenu + The feature is detected at runtime, and will remain disabled + if the cpu does not implement the feature. +endmenu # "ARMv8.7 architectural features" config ARM64_SVE bool "ARM Scalable Vector Extension support" @@ -1948,6 +1948,17 @@ config ARM64_SVE booting the kernel. If unsure and you are not observing these symptoms, you should assume that it is safe to say Y. +config ARM64_SME + bool "ARM Scalable Matrix Extension support" + default y + depends on ARM64_SVE + help + The Scalable Matrix Extension (SME) is an extension to the AArch64 + execution state which utilises a substantial subset of the SVE + instruction set, together with the addition of new architectural + register state capable of holding two dimensional matrix tiles to + enable various matrix operations. + config ARM64_MODULE_PLTS bool "Use PLTs to allow module memory to spill over into vmalloc area" depends on MODULES @@ -1991,7 +2002,7 @@ config ARM64_DEBUG_PRIORITY_MASKING the validity of ICC_PMR_EL1 when calling concerned functions. If unsure, say N -endif +endif # ARM64_PSEUDO_NMI config RELOCATABLE bool "Build a relocatable kernel image" if EXPERT @@ -2050,7 +2061,19 @@ config STACKPROTECTOR_PER_TASK def_bool y depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_SYSREG -endmenu +# The GPIO number here must be sorted by descending number. In case of +# a multiplatform kernel, we just want the highest value required by the +# selected platforms. +config ARCH_NR_GPIO + int + default 2048 if ARCH_APPLE + default 0 + help + Maximum number of GPIOs in the system. + + If unsure, leave the default value. + +endmenu # "Kernel Features" menu "Boot options" @@ -2114,7 +2137,7 @@ config EFI help This option provides support for runtime services provided by UEFI firmware (such as non-volatile variables, realtime - clock, and platform reset). A UEFI stub is also provided to + clock, and platform reset). A UEFI stub is also provided to allow the kernel to be booted as an EFI application. This is only useful on systems that have UEFI firmware. @@ -2129,7 +2152,7 @@ config DMI However, even with this option, the resultant kernel should continue to boot on existing non-UEFI platforms. -endmenu +endmenu # "Boot options" config SYSVIPC_COMPAT def_bool y @@ -2150,7 +2173,7 @@ config ARCH_HIBERNATION_HEADER config ARCH_SUSPEND_POSSIBLE def_bool y -endmenu +endmenu # "Power management options" menu "CPU Power Management" @@ -2158,7 +2181,7 @@ source "drivers/cpuidle/Kconfig" source "drivers/cpufreq/Kconfig" -endmenu +endmenu # "CPU Power Management" source "drivers/acpi/Kconfig" @@ -2166,4 +2189,4 @@ source "arch/arm64/kvm/Kconfig" if CRYPTO source "arch/arm64/crypto/Kconfig" -endif +endif # CRYPTO diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index 30b123cde02c..4e6d635a1731 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -11,12 +11,11 @@ config ARCH_ACTIONS config ARCH_SUNXI bool "Allwinner sunxi 64-bit SoC Family" select ARCH_HAS_RESET_CONTROLLER - select GENERIC_IRQ_CHIP - select IRQ_DOMAIN_HIERARCHY - select IRQ_FASTEOI_HIERARCHY_HANDLERS select PINCTRL select RESET_CONTROLLER select SUN4I_TIMER + select SUN6I_R_INTC + select SUNXI_NMI_INTC help This enables support for Allwinner sunxi based SoCs like the A64. @@ -253,6 +252,7 @@ config ARCH_INTEL_SOCFPGA config ARCH_SYNQUACER bool "Socionext SynQuacer SoC Family" + select IRQ_FASTEOI_HIERARCHY_HANDLERS config ARCH_TEGRA bool "NVIDIA Tegra SoC Family" @@ -325,4 +325,4 @@ config ARCH_ZYNQMP help This enables support for Xilinx ZynqMP Family -endmenu +endmenu # "Platform selection" diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts index 1cee26479bfe..98c9a3265446 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts @@ -303,7 +303,7 @@ /* switch nodes are enabled by U-Boot if modules are present */ switch0@10 { compatible = "marvell,mv88e6190"; - reg = <0x10 0>; + reg = <0x10>; dsa,member = <0 0>; interrupt-parent = <&moxtet>; interrupts = <MOXTET_IRQ_PERIDOT(0)>; @@ -428,7 +428,7 @@ switch0@2 { compatible = "marvell,mv88e6085"; - reg = <0x2 0>; + reg = <0x2>; dsa,member = <0 0>; interrupt-parent = <&moxtet>; interrupts = <MOXTET_IRQ_TOPAZ>; @@ -495,7 +495,7 @@ switch1@11 { compatible = "marvell,mv88e6190"; - reg = <0x11 0>; + reg = <0x11>; dsa,member = <0 1>; interrupt-parent = <&moxtet>; interrupts = <MOXTET_IRQ_PERIDOT(1)>; @@ -620,7 +620,7 @@ switch1@2 { compatible = "marvell,mv88e6085"; - reg = <0x2 0>; + reg = <0x2>; dsa,member = <0 1>; interrupt-parent = <&moxtet>; interrupts = <MOXTET_IRQ_TOPAZ>; @@ -687,7 +687,7 @@ switch2@12 { compatible = "marvell,mv88e6190"; - reg = <0x12 0>; + reg = <0x12>; dsa,member = <0 2>; interrupt-parent = <&moxtet>; interrupts = <MOXTET_IRQ_PERIDOT(2)>; @@ -803,7 +803,7 @@ switch2@2 { compatible = "marvell,mv88e6085"; - reg = <0x2 0>; + reg = <0x2>; dsa,member = <0 2>; interrupt-parent = <&moxtet>; interrupts = <MOXTET_IRQ_TOPAZ>; diff --git a/arch/arm64/boot/dts/mediatek/mt7622.dtsi b/arch/arm64/boot/dts/mediatek/mt7622.dtsi index 6f8cb3ad1e84..f232f8baf4e8 100644 --- a/arch/arm64/boot/dts/mediatek/mt7622.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt7622.dtsi @@ -357,7 +357,7 @@ }; cci_control2: slave-if@5000 { - compatible = "arm,cci-400-ctrl-if"; + compatible = "arm,cci-400-ctrl-if", "syscon"; interface-type = "ace"; reg = <0x5000 0x1000>; }; @@ -901,6 +901,11 @@ }; }; + hifsys: syscon@1af00000 { + compatible = "mediatek,mt7622-hifsys", "syscon"; + reg = <0 0x1af00000 0 0x70>; + }; + ethsys: syscon@1b000000 { compatible = "mediatek,mt7622-ethsys", "syscon"; @@ -919,6 +924,26 @@ #dma-cells = <1>; }; + pcie_mirror: pcie-mirror@10000400 { + compatible = "mediatek,mt7622-pcie-mirror", + "syscon"; + reg = <0 0x10000400 0 0x10>; + }; + + wed0: wed@1020a000 { + compatible = "mediatek,mt7622-wed", + "syscon"; + reg = <0 0x1020a000 0 0x1000>; + interrupts = <GIC_SPI 214 IRQ_TYPE_LEVEL_LOW>; + }; + + wed1: wed@1020b000 { + compatible = "mediatek,mt7622-wed", + "syscon"; + reg = <0 0x1020b000 0 0x1000>; + interrupts = <GIC_SPI 215 IRQ_TYPE_LEVEL_LOW>; + }; + eth: ethernet@1b100000 { compatible = "mediatek,mt7622-eth", "mediatek,mt2701-eth", @@ -945,6 +970,11 @@ power-domains = <&scpsys MT7622_POWER_DOMAIN_ETHSYS>; mediatek,ethsys = <ðsys>; mediatek,sgmiisys = <&sgmiisys>; + cci-control-port = <&cci_control2>; + mediatek,wed = <&wed0>, <&wed1>; + mediatek,pcie-mirror = <&pcie_mirror>; + mediatek,hifsys = <&hifsys>; + dma-coherent; #address-cells = <1>; #size-cells = <0>; status = "disabled"; diff --git a/arch/arm64/boot/dts/mediatek/mt7986a-rfb.dts b/arch/arm64/boot/dts/mediatek/mt7986a-rfb.dts index 21e420829572..882277a52b69 100644 --- a/arch/arm64/boot/dts/mediatek/mt7986a-rfb.dts +++ b/arch/arm64/boot/dts/mediatek/mt7986a-rfb.dts @@ -25,6 +25,80 @@ }; }; +ð { + status = "okay"; + + gmac0: mac@0 { + compatible = "mediatek,eth-mac"; + reg = <0>; + phy-mode = "2500base-x"; + + fixed-link { + speed = <2500>; + full-duplex; + pause; + }; + }; + + mdio: mdio-bus { + #address-cells = <1>; + #size-cells = <0>; + }; +}; + +&mdio { + switch: switch@0 { + compatible = "mediatek,mt7531"; + reg = <31>; + reset-gpios = <&pio 5 0>; + }; +}; + +&switch { + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + label = "lan0"; + }; + + port@1 { + reg = <1>; + label = "lan1"; + }; + + port@2 { + reg = <2>; + label = "lan2"; + }; + + port@3 { + reg = <3>; + label = "lan3"; + }; + + port@4 { + reg = <4>; + label = "lan4"; + }; + + port@6 { + reg = <6>; + label = "cpu"; + ethernet = <&gmac0>; + phy-mode = "2500base-x"; + + fixed-link { + speed = <2500>; + full-duplex; + pause; + }; + }; + }; +}; + &uart0 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/mediatek/mt7986a.dtsi b/arch/arm64/boot/dts/mediatek/mt7986a.dtsi index 694acf8f5b70..d2636a0ed152 100644 --- a/arch/arm64/boot/dts/mediatek/mt7986a.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt7986a.dtsi @@ -222,6 +222,45 @@ #reset-cells = <1>; }; + eth: ethernet@15100000 { + compatible = "mediatek,mt7986-eth"; + reg = <0 0x15100000 0 0x80000>; + interrupts = <GIC_SPI 196 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 197 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 198 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 199 IRQ_TYPE_LEVEL_HIGH>; + clocks = <ðsys CLK_ETH_FE_EN>, + <ðsys CLK_ETH_GP2_EN>, + <ðsys CLK_ETH_GP1_EN>, + <ðsys CLK_ETH_WOCPU1_EN>, + <ðsys CLK_ETH_WOCPU0_EN>, + <&sgmiisys0 CLK_SGMII0_TX250M_EN>, + <&sgmiisys0 CLK_SGMII0_RX250M_EN>, + <&sgmiisys0 CLK_SGMII0_CDR_REF>, + <&sgmiisys0 CLK_SGMII0_CDR_FB>, + <&sgmiisys1 CLK_SGMII1_TX250M_EN>, + <&sgmiisys1 CLK_SGMII1_RX250M_EN>, + <&sgmiisys1 CLK_SGMII1_CDR_REF>, + <&sgmiisys1 CLK_SGMII1_CDR_FB>, + <&topckgen CLK_TOP_NETSYS_SEL>, + <&topckgen CLK_TOP_NETSYS_500M_SEL>; + clock-names = "fe", "gp2", "gp1", "wocpu1", "wocpu0", + "sgmii_tx250m", "sgmii_rx250m", + "sgmii_cdr_ref", "sgmii_cdr_fb", + "sgmii2_tx250m", "sgmii2_rx250m", + "sgmii2_cdr_ref", "sgmii2_cdr_fb", + "netsys0", "netsys1"; + assigned-clocks = <&topckgen CLK_TOP_NETSYS_2X_SEL>, + <&topckgen CLK_TOP_SGM_325M_SEL>; + assigned-clock-parents = <&apmixedsys CLK_APMIXED_NET2PLL>, + <&apmixedsys CLK_APMIXED_SGMPLL>; + mediatek,ethsys = <ðsys>; + mediatek,sgmiisys = <&sgmiisys0>, <&sgmiisys1>; + #reset-cells = <1>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; }; }; diff --git a/arch/arm64/boot/dts/mediatek/mt7986b-rfb.dts b/arch/arm64/boot/dts/mediatek/mt7986b-rfb.dts index d73467ea3641..0f49d5764ff3 100644 --- a/arch/arm64/boot/dts/mediatek/mt7986b-rfb.dts +++ b/arch/arm64/boot/dts/mediatek/mt7986b-rfb.dts @@ -28,3 +28,73 @@ &uart0 { status = "okay"; }; + +ð { + status = "okay"; + + gmac0: mac@0 { + compatible = "mediatek,eth-mac"; + reg = <0>; + phy-mode = "2500base-x"; + + fixed-link { + speed = <2500>; + full-duplex; + pause; + }; + }; + + mdio: mdio-bus { + #address-cells = <1>; + #size-cells = <0>; + + switch@0 { + compatible = "mediatek,mt7531"; + reg = <31>; + reset-gpios = <&pio 5 0>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + label = "lan0"; + }; + + port@1 { + reg = <1>; + label = "lan1"; + }; + + port@2 { + reg = <2>; + label = "lan2"; + }; + + port@3 { + reg = <3>; + label = "lan3"; + }; + + port@4 { + reg = <4>; + label = "lan4"; + }; + + port@6 { + reg = <6>; + label = "cpu"; + ethernet = <&gmac0>; + phy-mode = "2500base-x"; + + fixed-link { + speed = <2500>; + full-duplex; + pause; + }; + }; + }; + }; + }; +}; diff --git a/arch/arm64/boot/dts/qcom/sm8250-mtp.dts b/arch/arm64/boot/dts/qcom/sm8250-mtp.dts index fb99cc2827c7..7ab3627cc347 100644 --- a/arch/arm64/boot/dts/qcom/sm8250-mtp.dts +++ b/arch/arm64/boot/dts/qcom/sm8250-mtp.dts @@ -622,6 +622,10 @@ status = "okay"; }; +&rxmacro { + status = "okay"; +}; + &slpi { status = "okay"; firmware-name = "qcom/sm8250/slpi.mbn"; @@ -773,6 +777,8 @@ }; &swr1 { + status = "okay"; + wcd_rx: wcd9380-rx@0,4 { compatible = "sdw20217010d00"; reg = <0 4>; @@ -781,6 +787,8 @@ }; &swr2 { + status = "okay"; + wcd_tx: wcd9380-tx@0,3 { compatible = "sdw20217010d00"; reg = <0 3>; @@ -819,6 +827,10 @@ }; }; +&txmacro { + status = "okay"; +}; + &uart12 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/qcom/sm8250.dtsi b/arch/arm64/boot/dts/qcom/sm8250.dtsi index af8f22636436..1304b86af1a0 100644 --- a/arch/arm64/boot/dts/qcom/sm8250.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8250.dtsi @@ -2255,6 +2255,7 @@ pinctrl-0 = <&rx_swr_active>; compatible = "qcom,sm8250-lpass-rx-macro"; reg = <0 0x3200000 0 0x1000>; + status = "disabled"; clocks = <&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, <&q6afecc LPASS_CLK_ID_TX_CORE_NPL_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, @@ -2273,6 +2274,7 @@ swr1: soundwire-controller@3210000 { reg = <0 0x3210000 0 0x2000>; compatible = "qcom,soundwire-v1.5.1"; + status = "disabled"; interrupts = <GIC_SPI 298 IRQ_TYPE_LEVEL_HIGH>; clocks = <&rxmacro>; clock-names = "iface"; @@ -2300,6 +2302,7 @@ pinctrl-0 = <&tx_swr_active>; compatible = "qcom,sm8250-lpass-tx-macro"; reg = <0 0x3220000 0 0x1000>; + status = "disabled"; clocks = <&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, <&q6afecc LPASS_CLK_ID_TX_CORE_NPL_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, @@ -2323,6 +2326,7 @@ compatible = "qcom,soundwire-v1.5.1"; interrupts-extended = <&intc GIC_SPI 297 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "core"; + status = "disabled"; clocks = <&txmacro>; clock-names = "iface"; diff --git a/arch/arm64/boot/dts/rockchip/rk3568-bpi-r2-pro.dts b/arch/arm64/boot/dts/rockchip/rk3568-bpi-r2-pro.dts index a01886b467ed..067fe4a6b178 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-bpi-r2-pro.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-bpi-r2-pro.dts @@ -16,6 +16,7 @@ aliases { ethernet0 = &gmac0; + ethernet1 = &gmac1; mmc0 = &sdmmc0; mmc1 = &sdhci; }; @@ -78,7 +79,6 @@ assigned-clocks = <&cru SCLK_GMAC0_RX_TX>, <&cru SCLK_GMAC0>; assigned-clock-parents = <&cru SCLK_GMAC0_RGMII_SPEED>, <&cru CLK_MAC0_2TOP>; clock_in_out = "input"; - phy-handle = <&rgmii_phy0>; phy-mode = "rgmii"; pinctrl-names = "default"; pinctrl-0 = <&gmac0_miim @@ -90,8 +90,38 @@ snps,reset-active-low; /* Reset time is 20ms, 100ms for rtl8211f */ snps,reset-delays-us = <0 20000 100000>; + tx_delay = <0x4f>; + rx_delay = <0x0f>; + status = "okay"; + + fixed-link { + speed = <1000>; + full-duplex; + pause; + }; +}; + +&gmac1 { + assigned-clocks = <&cru SCLK_GMAC1_RX_TX>, <&cru SCLK_GMAC1>; + assigned-clock-parents = <&cru SCLK_GMAC1_RGMII_SPEED>, <&cru CLK_MAC1_2TOP>; + clock_in_out = "output"; + phy-handle = <&rgmii_phy1>; + phy-mode = "rgmii"; + pinctrl-names = "default"; + pinctrl-0 = <&gmac1m1_miim + &gmac1m1_tx_bus2 + &gmac1m1_rx_bus2 + &gmac1m1_rgmii_clk + &gmac1m1_rgmii_bus>; + + snps,reset-gpio = <&gpio3 RK_PB0 GPIO_ACTIVE_LOW>; + snps,reset-active-low; + /* Reset time is 20ms, 100ms for rtl8211f */ + snps,reset-delays-us = <0 20000 100000>; + tx_delay = <0x3c>; rx_delay = <0x2f>; + status = "okay"; }; @@ -315,8 +345,8 @@ status = "disabled"; }; -&mdio0 { - rgmii_phy0: ethernet-phy@0 { +&mdio1 { + rgmii_phy1: ethernet-phy@0 { compatible = "ethernet-phy-ieee802.3-c22"; reg = <0x0>; }; @@ -345,9 +375,9 @@ pmuio2-supply = <&vcc3v3_pmu>; vccio1-supply = <&vccio_acodec>; vccio3-supply = <&vccio_sd>; - vccio4-supply = <&vcc_1v8>; + vccio4-supply = <&vcc_3v3>; vccio5-supply = <&vcc_3v3>; - vccio6-supply = <&vcc_3v3>; + vccio6-supply = <&vcc_1v8>; vccio7-supply = <&vcc_3v3>; status = "okay"; }; diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 345fe98605ba..5c8ee5a541d2 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -7,3 +7,4 @@ generic-y += parport.h generic-y += user.h generated-y += cpucaps.h +generated-y += sysreg-defs.h diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 8bd5afc7b692..48d4473e8eee 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -26,12 +26,6 @@ * sets the GP register's most significant bits to 0 with an explicit cast. */ -static inline void gic_write_eoir(u32 irq) -{ - write_sysreg_s(irq, SYS_ICC_EOIR1_EL1); - isb(); -} - static __always_inline void gic_write_dir(u32 irq) { write_sysreg_s(irq, SYS_ICC_DIR_EL1); diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index d1bb5e71df25..3a6b6d38c5b8 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -142,7 +142,7 @@ static inline bool __init __early_cpu_has_rndr(void) { /* Open code as we run prior to the first call to cpufeature. */ unsigned long ftr = read_sysreg_s(SYS_ID_AA64ISAR0_EL1); - return (ftr >> ID_AA64ISAR0_RNDR_SHIFT) & 0xf; + return (ftr >> ID_AA64ISAR0_EL1_RNDR_SHIFT) & 0xf; } static inline bool __init __must_check diff --git a/arch/arm64/include/asm/asm-bug.h b/arch/arm64/include/asm/asm-bug.h index 03f52f84a4f3..c762038ba400 100644 --- a/arch/arm64/include/asm/asm-bug.h +++ b/arch/arm64/include/asm/asm-bug.h @@ -14,7 +14,7 @@ 14472: .string file; \ .popsection; \ \ - .long 14472b - 14470b; \ + .long 14472b - .; \ .short line; #else #define _BUGVERBOSE_LOCATION(file, line) @@ -25,7 +25,7 @@ #define __BUG_ENTRY(flags) \ .pushsection __bug_table,"aw"; \ .align 2; \ - 14470: .long 14471f - 14470b; \ + 14470: .long 14471f - .; \ _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ .short flags; \ .popsection; \ diff --git a/arch/arm64/include/asm/compiler.h b/arch/arm64/include/asm/compiler.h index dc3ea4080e2e..6fb2e6bcc392 100644 --- a/arch/arm64/include/asm/compiler.h +++ b/arch/arm64/include/asm/compiler.h @@ -23,20 +23,4 @@ #define __builtin_return_address(val) \ (void *)(ptrauth_clear_pac((unsigned long)__builtin_return_address(val))) -#ifdef CONFIG_CFI_CLANG -/* - * With CONFIG_CFI_CLANG, the compiler replaces function address - * references with the address of the function's CFI jump table - * entry. The function_nocfi macro always returns the address of the - * actual function instead. - */ -#define function_nocfi(x) ({ \ - void *addr; \ - asm("adrp %0, " __stringify(x) "\n\t" \ - "add %0, %0, :lo12:" __stringify(x) \ - : "=r" (addr)); \ - addr; \ -}) -#endif - #endif /* __ASM_COMPILER_H */ diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index a58e366f0b07..115cdec1ae87 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -58,11 +58,15 @@ struct cpuinfo_arm64 { u64 reg_id_aa64pfr0; u64 reg_id_aa64pfr1; u64 reg_id_aa64zfr0; + u64 reg_id_aa64smfr0; struct cpuinfo_32bit aarch32; /* pseudo-ZCR for recording maximum ZCR_EL1 LEN value: */ u64 reg_zcr; + + /* pseudo-SMCR for recording maximum SMCR_EL1 LEN value: */ + u64 reg_smcr; }; DECLARE_PER_CPU(struct cpuinfo_arm64, cpu_data); diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index c62e7e5e2f0c..14a8f3d93add 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -622,6 +622,13 @@ static inline bool id_aa64pfr0_sve(u64 pfr0) return val > 0; } +static inline bool id_aa64pfr1_sme(u64 pfr1) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_SME_SHIFT); + + return val > 0; +} + static inline bool id_aa64pfr1_mte(u64 pfr1) { u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_MTE_SHIFT); @@ -759,6 +766,23 @@ static __always_inline bool system_supports_sve(void) cpus_have_const_cap(ARM64_SVE); } +static __always_inline bool system_supports_sme(void) +{ + return IS_ENABLED(CONFIG_ARM64_SME) && + cpus_have_const_cap(ARM64_SME); +} + +static __always_inline bool system_supports_fa64(void) +{ + return IS_ENABLED(CONFIG_ARM64_SME) && + cpus_have_const_cap(ARM64_SME_FA64); +} + +static __always_inline bool system_supports_tpidr2(void) +{ + return system_supports_sme(); +} + static __always_inline bool system_supports_cnp(void) { return IS_ENABLED(CONFIG_ARM64_CNP) && diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index ff8f4511df71..92331c07c2d1 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -36,7 +36,7 @@ #define MIDR_VARIANT(midr) \ (((midr) & MIDR_VARIANT_MASK) >> MIDR_VARIANT_SHIFT) #define MIDR_IMPLEMENTOR_SHIFT 24 -#define MIDR_IMPLEMENTOR_MASK (0xff << MIDR_IMPLEMENTOR_SHIFT) +#define MIDR_IMPLEMENTOR_MASK (0xffU << MIDR_IMPLEMENTOR_SHIFT) #define MIDR_IMPLEMENTOR(midr) \ (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT) diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 00c291067e57..7b7e05c02691 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -64,7 +64,7 @@ struct task_struct; struct step_hook { struct list_head node; - int (*fn)(struct pt_regs *regs, unsigned int esr); + int (*fn)(struct pt_regs *regs, unsigned long esr); }; void register_user_step_hook(struct step_hook *hook); @@ -75,7 +75,7 @@ void unregister_kernel_step_hook(struct step_hook *hook); struct break_hook { struct list_head node; - int (*fn)(struct pt_regs *regs, unsigned int esr); + int (*fn)(struct pt_regs *regs, unsigned long esr); u16 imm; u16 mask; /* These bits are ignored when comparing with imm */ }; diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index c31be7eda9df..34ceff08cac4 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -143,6 +143,50 @@ .Lskip_sve_\@: .endm +/* SME register access and priority mapping */ +.macro __init_el2_nvhe_sme + mrs x1, id_aa64pfr1_el1 + ubfx x1, x1, #ID_AA64PFR1_SME_SHIFT, #4 + cbz x1, .Lskip_sme_\@ + + bic x0, x0, #CPTR_EL2_TSM // Also disable SME traps + msr cptr_el2, x0 // Disable copro. traps to EL2 + isb + + mrs x1, sctlr_el2 + orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps + msr sctlr_el2, x1 + isb + + mov x1, #0 // SMCR controls + + mrs_s x2, SYS_ID_AA64SMFR0_EL1 + ubfx x2, x2, #ID_AA64SMFR0_FA64_SHIFT, #1 // Full FP in SM? + cbz x2, .Lskip_sme_fa64_\@ + + orr x1, x1, SMCR_ELx_FA64_MASK +.Lskip_sme_fa64_\@: + + orr x1, x1, #SMCR_ELx_LEN_MASK // Enable full SME vector + msr_s SYS_SMCR_EL2, x1 // length for EL1. + + mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported? + ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1 + cbz x1, .Lskip_sme_\@ + + msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal + + mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present? + ubfx x1, x1, #ID_AA64MMFR1_HCX_SHIFT, #4 + cbz x1, .Lskip_sme_\@ + + mrs_s x1, SYS_HCRX_EL2 + orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping + msr_s SYS_HCRX_EL2, x1 + +.Lskip_sme_\@: +.endm + /* Disable any fine grained traps */ .macro __init_el2_fgt mrs x1, id_aa64mmfr0_el1 @@ -153,15 +197,26 @@ mrs x1, id_aa64dfr0_el1 ubfx x1, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 cmp x1, #3 - b.lt .Lset_fgt_\@ + b.lt .Lset_debug_fgt_\@ /* Disable PMSNEVFR_EL1 read and write traps */ orr x0, x0, #(1 << 62) -.Lset_fgt_\@: +.Lset_debug_fgt_\@: msr_s SYS_HDFGRTR_EL2, x0 msr_s SYS_HDFGWTR_EL2, x0 - msr_s SYS_HFGRTR_EL2, xzr - msr_s SYS_HFGWTR_EL2, xzr + + mov x0, xzr + mrs x1, id_aa64pfr1_el1 + ubfx x1, x1, #ID_AA64PFR1_SME_SHIFT, #4 + cbz x1, .Lset_fgt_\@ + + /* Disable nVHE traps of TPIDR2 and SMPRI */ + orr x0, x0, #HFGxTR_EL2_nSMPRI_EL1_MASK + orr x0, x0, #HFGxTR_EL2_nTPIDR2_EL0_MASK + +.Lset_fgt_\@: + msr_s SYS_HFGRTR_EL2, x0 + msr_s SYS_HFGWTR_EL2, x0 msr_s SYS_HFGITR_EL2, xzr mrs x1, id_aa64pfr0_el1 // AMU traps UNDEF without AMU @@ -196,6 +251,7 @@ __init_el2_nvhe_idregs __init_el2_nvhe_cptr __init_el2_nvhe_sve + __init_el2_nvhe_sme __init_el2_fgt __init_el2_nvhe_prepare_eret .endm diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index d52a0b269ee8..8f236de7359c 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -37,7 +37,8 @@ #define ESR_ELx_EC_ERET (0x1a) /* EL2 only */ /* Unallocated EC: 0x1B */ #define ESR_ELx_EC_FPAC (0x1C) /* EL1 and above */ -/* Unallocated EC: 0x1D - 0x1E */ +#define ESR_ELx_EC_SME (0x1D) +/* Unallocated EC: 0x1E */ #define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ #define ESR_ELx_EC_IABT_LOW (0x20) #define ESR_ELx_EC_IABT_CUR (0x21) @@ -75,6 +76,7 @@ #define ESR_ELx_IL_SHIFT (25) #define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) +#define ESR_ELx_ISS(esr) ((esr) & ESR_ELx_ISS_MASK) /* ISS field definitions shared by different classes */ #define ESR_ELx_WNR_SHIFT (6) @@ -136,7 +138,7 @@ #define ESR_ELx_WFx_ISS_TI (UL(1) << 0) #define ESR_ELx_WFx_ISS_WFI (UL(0) << 0) #define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) -#define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1) +#define ESR_ELx_xVC_IMM_MASK ((UL(1) << 16) - 1) #define DISR_EL1_IDS (UL(1) << 24) /* @@ -327,17 +329,26 @@ #define ESR_ELx_CP15_32_ISS_SYS_CNTFRQ (ESR_ELx_CP15_32_ISS_SYS_VAL(0, 0, 14, 0) |\ ESR_ELx_CP15_32_ISS_DIR_READ) +/* + * ISS values for SME traps + */ + +#define ESR_ELx_SME_ISS_SME_DISABLED 0 +#define ESR_ELx_SME_ISS_ILL 1 +#define ESR_ELx_SME_ISS_SM_DISABLED 2 +#define ESR_ELx_SME_ISS_ZA_DISABLED 3 + #ifndef __ASSEMBLY__ #include <asm/types.h> -static inline bool esr_is_data_abort(u32 esr) +static inline bool esr_is_data_abort(unsigned long esr) { - const u32 ec = ESR_ELx_EC(esr); + const unsigned long ec = ESR_ELx_EC(esr); return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR; } -const char *esr_get_class_string(u32 esr); +const char *esr_get_class_string(unsigned long esr); #endif /* __ASSEMBLY */ #endif /* __ASM_ESR_H */ diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 339477dca551..d94aecff9690 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -19,9 +19,9 @@ #define __exception_irq_entry __kprobes #endif -static inline u32 disr_to_esr(u64 disr) +static inline unsigned long disr_to_esr(u64 disr) { - unsigned int esr = ESR_ELx_EC_SERROR << ESR_ELx_EC_SHIFT; + unsigned long esr = ESR_ELx_EC_SERROR << ESR_ELx_EC_SHIFT; if ((disr & DISR_EL1_IDS) == 0) esr |= (disr & DISR_EL1_ESR_MASK); @@ -57,23 +57,24 @@ asmlinkage void call_on_irq_stack(struct pt_regs *regs, void (*func)(struct pt_regs *)); asmlinkage void asm_exit_to_user_mode(struct pt_regs *regs); -void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs); +void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); void do_bti(struct pt_regs *regs); -void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, +void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr, struct pt_regs *regs); -void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs); -void do_sve_acc(unsigned int esr, struct pt_regs *regs); -void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs); -void do_sysinstr(unsigned int esr, struct pt_regs *regs); -void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); -void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr); -void do_cp15instr(unsigned int esr, struct pt_regs *regs); +void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs); +void do_sve_acc(unsigned long esr, struct pt_regs *regs); +void do_sme_acc(unsigned long esr, struct pt_regs *regs); +void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs); +void do_sysinstr(unsigned long esr, struct pt_regs *regs); +void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs); +void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr); +void do_cp15instr(unsigned long esr, struct pt_regs *regs); void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); -void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr); -void do_serror(struct pt_regs *regs, unsigned int esr); +void do_ptrauth_fault(struct pt_regs *regs, unsigned long esr); +void do_serror(struct pt_regs *regs, unsigned long esr); void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags); -void panic_bad_stack(struct pt_regs *regs, unsigned int esr, unsigned long far); +void panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far); #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index cb24385e3632..9bb1873f5295 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -32,6 +32,18 @@ #define VFP_STATE_SIZE ((32 * 8) + 4) #endif +/* + * When we defined the maximum SVE vector length we defined the ABI so + * that the maximum vector length included all the reserved for future + * expansion bits in ZCR rather than those just currently defined by + * the architecture. While SME follows a similar pattern the fact that + * it includes a square matrix means that any allocations that attempt + * to cover the maximum potential vector length (such as happen with + * the regset used for ptrace) end up being extremely large. Define + * the much lower actual limit for use in such situations. + */ +#define SME_VQ_MAX 16 + struct task_struct; extern void fpsimd_save_state(struct user_fpsimd_state *state); @@ -46,11 +58,23 @@ extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, - void *sve_state, unsigned int sve_vl); + void *sve_state, unsigned int sve_vl, + void *za_state, unsigned int sme_vl, + u64 *svcr); extern void fpsimd_flush_task_state(struct task_struct *target); extern void fpsimd_save_and_flush_cpu_state(void); +static inline bool thread_sm_enabled(struct thread_struct *thread) +{ + return system_supports_sme() && (thread->svcr & SVCR_SM_MASK); +} + +static inline bool thread_za_enabled(struct thread_struct *thread) +{ + return system_supports_sme() && (thread->svcr & SVCR_ZA_MASK); +} + /* Maximum VL that SVE/SME VL-agnostic software can transparently support */ #define VL_ARCH_MAX 0x100 @@ -62,7 +86,14 @@ static inline size_t sve_ffr_offset(int vl) static inline void *sve_pffr(struct thread_struct *thread) { - return (char *)thread->sve_state + sve_ffr_offset(thread_get_sve_vl(thread)); + unsigned int vl; + + if (system_supports_sme() && thread_sm_enabled(thread)) + vl = thread_get_sme_vl(thread); + else + vl = thread_get_sve_vl(thread); + + return (char *)thread->sve_state + sve_ffr_offset(vl); } extern void sve_save_state(void *state, u32 *pfpsr, int save_ffr); @@ -71,11 +102,17 @@ extern void sve_load_state(void const *state, u32 const *pfpsr, extern void sve_flush_live(bool flush_ffr, unsigned long vq_minus_1); extern unsigned int sve_get_vl(void); extern void sve_set_vq(unsigned long vq_minus_1); +extern void sme_set_vq(unsigned long vq_minus_1); +extern void za_save_state(void *state); +extern void za_load_state(void const *state); struct arm64_cpu_capabilities; extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused); +extern void sme_kernel_enable(const struct arm64_cpu_capabilities *__unused); +extern void fa64_kernel_enable(const struct arm64_cpu_capabilities *__unused); extern u64 read_zcr_features(void); +extern u64 read_smcr_features(void); /* * Helpers to translate bit indices in sve_vq_map to VQ values (and @@ -119,6 +156,7 @@ struct vl_info { extern void sve_alloc(struct task_struct *task); extern void fpsimd_release_task(struct task_struct *task); extern void fpsimd_sync_to_sve(struct task_struct *task); +extern void fpsimd_force_sync_to_sve(struct task_struct *task); extern void sve_sync_to_fpsimd(struct task_struct *task); extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task); @@ -171,6 +209,12 @@ static inline void write_vl(enum vec_type type, u64 val) write_sysreg_s(tmp | val, SYS_ZCR_EL1); break; #endif +#ifdef CONFIG_ARM64_SME + case ARM64_VEC_SME: + tmp = read_sysreg_s(SYS_SMCR_EL1) & ~SMCR_ELx_LEN_MASK; + write_sysreg_s(tmp | val, SYS_SMCR_EL1); + break; +#endif default: WARN_ON_ONCE(1); break; @@ -208,6 +252,8 @@ static inline bool sve_vq_available(unsigned int vq) return vq_available(ARM64_VEC_SVE, vq); } +size_t sve_state_size(struct task_struct const *task); + #else /* ! CONFIG_ARM64_SVE */ static inline void sve_alloc(struct task_struct *task) { } @@ -247,8 +293,93 @@ static inline void vec_update_vq_map(enum vec_type t) { } static inline int vec_verify_vq_map(enum vec_type t) { return 0; } static inline void sve_setup(void) { } +static inline size_t sve_state_size(struct task_struct const *task) +{ + return 0; +} + #endif /* ! CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static inline void sme_user_disable(void) +{ + sysreg_clear_set(cpacr_el1, CPACR_EL1_SMEN_EL0EN, 0); +} + +static inline void sme_user_enable(void) +{ + sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_SMEN_EL0EN); +} + +static inline void sme_smstart_sm(void) +{ + asm volatile(__msr_s(SYS_SVCR_SMSTART_SM_EL0, "xzr")); +} + +static inline void sme_smstop_sm(void) +{ + asm volatile(__msr_s(SYS_SVCR_SMSTOP_SM_EL0, "xzr")); +} + +static inline void sme_smstop(void) +{ + asm volatile(__msr_s(SYS_SVCR_SMSTOP_SMZA_EL0, "xzr")); +} + +extern void __init sme_setup(void); + +static inline int sme_max_vl(void) +{ + return vec_max_vl(ARM64_VEC_SME); +} + +static inline int sme_max_virtualisable_vl(void) +{ + return vec_max_virtualisable_vl(ARM64_VEC_SME); +} + +extern void sme_alloc(struct task_struct *task); +extern unsigned int sme_get_vl(void); +extern int sme_set_current_vl(unsigned long arg); +extern int sme_get_current_vl(void); + +/* + * Return how many bytes of memory are required to store the full SME + * specific state (currently just ZA) for task, given task's currently + * configured vector length. + */ +static inline size_t za_state_size(struct task_struct const *task) +{ + unsigned int vl = task_get_sme_vl(task); + + return ZA_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +} + +#else + +static inline void sme_user_disable(void) { BUILD_BUG(); } +static inline void sme_user_enable(void) { BUILD_BUG(); } + +static inline void sme_smstart_sm(void) { } +static inline void sme_smstop_sm(void) { } +static inline void sme_smstop(void) { } + +static inline void sme_alloc(struct task_struct *task) { } +static inline void sme_setup(void) { } +static inline unsigned int sme_get_vl(void) { return 0; } +static inline int sme_max_vl(void) { return 0; } +static inline int sme_max_virtualisable_vl(void) { return 0; } +static inline int sme_set_current_vl(unsigned long arg) { return -EINVAL; } +static inline int sme_get_current_vl(void) { return -EINVAL; } + +static inline size_t za_state_size(struct task_struct const *task) +{ + return 0; +} + +#endif /* ! CONFIG_ARM64_SME */ + /* For use by EFI runtime services calls only */ extern void __efi_fpsimd_begin(void); extern void __efi_fpsimd_end(void); diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index 2509d7dde55a..5e0910cf4832 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -93,6 +93,12 @@ .endif .endm +.macro _sme_check_wv v + .if (\v) < 12 || (\v) > 15 + .error "Bad vector select register \v." + .endif +.endm + /* SVE instruction encodings for non-SVE-capable assemblers */ /* (pre binutils 2.28, all kernel capable clang versions support SVE) */ @@ -174,6 +180,54 @@ | (\np) .endm +/* SME instruction encodings for non-SME-capable assemblers */ +/* (pre binutils 2.38/LLVM 13) */ + +/* RDSVL X\nx, #\imm */ +.macro _sme_rdsvl nx, imm + _check_general_reg \nx + _check_num (\imm), -0x20, 0x1f + .inst 0x04bf5800 \ + | (\nx) \ + | (((\imm) & 0x3f) << 5) +.endm + +/* + * STR (vector from ZA array): + * STR ZA[\nw, #\offset], [X\nxbase, #\offset, MUL VL] + */ +.macro _sme_str_zav nw, nxbase, offset=0 + _sme_check_wv \nw + _check_general_reg \nxbase + _check_num (\offset), -0x100, 0xff + .inst 0xe1200000 \ + | (((\nw) & 3) << 13) \ + | ((\nxbase) << 5) \ + | ((\offset) & 7) +.endm + +/* + * LDR (vector to ZA array): + * LDR ZA[\nw, #\offset], [X\nxbase, #\offset, MUL VL] + */ +.macro _sme_ldr_zav nw, nxbase, offset=0 + _sme_check_wv \nw + _check_general_reg \nxbase + _check_num (\offset), -0x100, 0xff + .inst 0xe1000000 \ + | (((\nw) & 3) << 13) \ + | ((\nxbase) << 5) \ + | ((\offset) & 7) +.endm + +/* + * Zero the entire ZA array + * ZERO ZA + */ +.macro zero_za + .inst 0xc00800ff +.endm + .macro __for from:req, to:req .if (\from) == (\to) _for__body %\from @@ -208,6 +262,17 @@ 921: .endm +/* Update SMCR_EL1.LEN with the new VQ */ +.macro sme_load_vq xvqminus1, xtmp, xtmp2 + mrs_s \xtmp, SYS_SMCR_EL1 + bic \xtmp2, \xtmp, SMCR_ELx_LEN_MASK + orr \xtmp2, \xtmp2, \xvqminus1 + cmp \xtmp2, \xtmp + b.eq 921f + msr_s SYS_SMCR_EL1, \xtmp2 //self-synchronising +921: +.endm + /* Preserve the first 128-bits of Znz and zero the rest. */ .macro _sve_flush_z nz _sve_check_zreg \nz @@ -254,3 +319,25 @@ ldr w\nxtmp, [\xpfpsr, #4] msr fpcr, x\nxtmp .endm + +.macro sme_save_za nxbase, xvl, nw + mov w\nw, #0 + +423: + _sme_str_zav \nw, \nxbase + add x\nxbase, x\nxbase, \xvl + add x\nw, x\nw, #1 + cmp \xvl, x\nw + bne 423b +.endm + +.macro sme_load_za nxbase, xvl, nw + mov w\nw, #0 + +423: + _sme_ldr_zav \nw, \nxbase + add x\nxbase, x\nxbase, \xvl + add x\nw, x\nw, #1 + cmp \xvl, x\nw + bne 423b +.endm diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index 1494cfa8639b..dbc45a4157fa 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -80,8 +80,15 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS struct dyn_ftrace; +struct ftrace_ops; +struct ftrace_regs; + int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); #define ftrace_init_nop ftrace_init_nop + +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +#define ftrace_graph_func ftrace_graph_func #endif #define ftrace_return_address(n) return_address(n) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 1242f71937f8..d656822b13f1 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -44,6 +44,8 @@ extern void huge_ptep_clear_flush(struct vm_area_struct *vma, #define __HAVE_ARCH_HUGE_PTE_CLEAR extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz); +#define __HAVE_ARCH_HUGE_PTEP_GET +extern pte_t huge_ptep_get(pte_t *ptep); extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); #define set_huge_swap_pte_at set_huge_swap_pte_at diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 8db5ec0089db..9f0ce004fdbc 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -109,6 +109,14 @@ #define KERNEL_HWCAP_AFP __khwcap2_feature(AFP) #define KERNEL_HWCAP_RPRES __khwcap2_feature(RPRES) #define KERNEL_HWCAP_MTE3 __khwcap2_feature(MTE3) +#define KERNEL_HWCAP_SME __khwcap2_feature(SME) +#define KERNEL_HWCAP_SME_I16I64 __khwcap2_feature(SME_I16I64) +#define KERNEL_HWCAP_SME_F64F64 __khwcap2_feature(SME_F64F64) +#define KERNEL_HWCAP_SME_I8I32 __khwcap2_feature(SME_I8I32) +#define KERNEL_HWCAP_SME_F16F32 __khwcap2_feature(SME_F16F32) +#define KERNEL_HWCAP_SME_B16F32 __khwcap2_feature(SME_B16F32) +#define KERNEL_HWCAP_SME_F32F32 __khwcap2_feature(SME_F32F32) +#define KERNEL_HWCAP_SME_FA64 __khwcap2_feature(SME_FA64) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 1e5760d567ae..6aa2dc836db1 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -201,6 +201,8 @@ enum aarch64_insn_size_type { enum aarch64_insn_ldst_type { AARCH64_INSN_LDST_LOAD_REG_OFFSET, AARCH64_INSN_LDST_STORE_REG_OFFSET, + AARCH64_INSN_LDST_LOAD_IMM_OFFSET, + AARCH64_INSN_LDST_STORE_IMM_OFFSET, AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX, AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX, AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX, @@ -335,6 +337,7 @@ __AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00) __AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400) __AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400) __AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) +__AARCH64_INSN_FUNCS(str_imm, 0x3FC00000, 0x39000000) __AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000) __AARCH64_INSN_FUNCS(ldclr, 0x3F20FC00, 0x38201000) __AARCH64_INSN_FUNCS(ldeor, 0x3F20FC00, 0x38202000) @@ -342,6 +345,7 @@ __AARCH64_INSN_FUNCS(ldset, 0x3F20FC00, 0x38203000) __AARCH64_INSN_FUNCS(swp, 0x3F20FC00, 0x38208000) __AARCH64_INSN_FUNCS(cas, 0x3FA07C00, 0x08A07C00) __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) +__AARCH64_INSN_FUNCS(ldr_imm, 0x3FC00000, 0x39400000) __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) @@ -501,6 +505,11 @@ u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, enum aarch64_insn_register offset, enum aarch64_insn_size_type size, enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + unsigned int imm, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type); u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, enum aarch64_insn_register reg2, enum aarch64_insn_register base, diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 7fd836bea7eb..3995652daf81 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -192,4 +192,8 @@ extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); extern int valid_phys_addr_range(phys_addr_t addr, size_t size); extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); +extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap + #endif /* __ASM_IO_H */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 1767ded83888..13ae232ec4a1 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -279,6 +279,7 @@ #define CPTR_EL2_TCPAC (1U << 31) #define CPTR_EL2_TAM (1 << 30) #define CPTR_EL2_TTA (1 << 20) +#define CPTR_EL2_TSM (1 << 12) #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) #define CPTR_EL2_TZ (1 << 8) #define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index f71358271b71..08233172e7a9 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -236,14 +236,14 @@ static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu) return mode != PSR_MODE_EL0t; } -static __always_inline u32 kvm_vcpu_get_esr(const struct kvm_vcpu *vcpu) +static __always_inline u64 kvm_vcpu_get_esr(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.esr_el2; } static __always_inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) { - u32 esr = kvm_vcpu_get_esr(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); if (esr & ESR_ELx_CV) return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; @@ -374,7 +374,7 @@ static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) { - u32 esr = kvm_vcpu_get_esr(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); return ESR_ELx_SYS64_ISS_RT(esr); } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 94a27a7520f4..d5888dedf02a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -153,7 +153,7 @@ struct kvm_arch { }; struct kvm_vcpu_fault_info { - u32 esr_el2; /* Hyp Syndrom Register */ + u64 esr_el2; /* Hyp Syndrom Register */ u64 far_el2; /* Hyp Fault Address Register */ u64 hpfar_el2; /* Hyp IPA Fault Address Register */ u64 disr_el1; /* Deferred [SError] Status Register */ @@ -295,8 +295,11 @@ struct vcpu_reset_state { struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; + + /* Guest floating point state */ void *sve_state; unsigned int sve_max_vl; + u64 svcr; /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; @@ -451,6 +454,7 @@ struct kvm_vcpu_arch { #define KVM_ARM64_DEBUG_STATE_SAVE_TRBE (1 << 13) /* Save TRBE context if active */ #define KVM_ARM64_FP_FOREIGN_FPSTATE (1 << 14) #define KVM_ARM64_ON_UNSUPPORTED_CPU (1 << 15) /* Physical CPU not in supported_cpus */ +#define KVM_ARM64_HOST_SME_ENABLED (1 << 16) /* SME enabled for EL0 */ #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ diff --git a/arch/arm64/include/asm/kvm_ras.h b/arch/arm64/include/asm/kvm_ras.h index 8ac6ee77437c..87e10d9a635b 100644 --- a/arch/arm64/include/asm/kvm_ras.h +++ b/arch/arm64/include/asm/kvm_ras.h @@ -14,7 +14,7 @@ * Was this synchronous external abort a RAS notification? * Returns '0' for errors handled by some RAS subsystem, or -ENOENT. */ -static inline int kvm_handle_guest_sea(phys_addr_t addr, unsigned int esr) +static inline int kvm_handle_guest_sea(phys_addr_t addr, u64 esr) { /* apei_claim_sea(NULL) expects to mask interrupts itself */ lockdep_assert_irqs_enabled(); diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index adcb937342f1..aa523591a44e 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -47,6 +47,7 @@ long set_mte_ctrl(struct task_struct *task, unsigned long arg); long get_mte_ctrl(struct task_struct *task); int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data); +size_t mte_probe_user_range(const char __user *uaddr, size_t size); #else /* CONFIG_ARM64_MTE */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 66671ff05183..dd3d12bce07b 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -49,7 +49,7 @@ #define PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PTRS_PER_PMD PTRS_PER_PTE +#define PTRS_PER_PMD (1 << (PAGE_SHIFT - 3)) #endif /* @@ -59,7 +59,7 @@ #define PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) #define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) -#define PTRS_PER_PUD PTRS_PER_PTE +#define PTRS_PER_PUD (1 << (PAGE_SHIFT - 3)) #endif /* diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index dff2b483ea50..45c358538f13 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1001,7 +1001,8 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, */ static inline bool arch_faults_on_old_pte(void) { - WARN_ON(preemptible()); + /* The register read below requires a stable CPU to make any sense */ + cant_migrate(); return !cpu_has_hw_af(); } diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 73e38d9a540c..bf8aafee1eac 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -118,6 +118,7 @@ struct debug_info { enum vec_type { ARM64_VEC_SVE = 0, + ARM64_VEC_SME, ARM64_VEC_MAX, }; @@ -153,6 +154,7 @@ struct thread_struct { unsigned int fpsimd_cpu; void *sve_state; /* SVE registers, if any */ + void *za_state; /* ZA register, if any */ unsigned int vl[ARM64_VEC_MAX]; /* vector length */ unsigned int vl_onexec[ARM64_VEC_MAX]; /* vl after next exec */ unsigned long fault_address; /* fault info */ @@ -168,6 +170,8 @@ struct thread_struct { u64 mte_ctrl; #endif u64 sctlr_user; + u64 svcr; + u64 tpidr2_el0; }; static inline unsigned int thread_get_vl(struct thread_struct *thread, @@ -181,6 +185,19 @@ static inline unsigned int thread_get_sve_vl(struct thread_struct *thread) return thread_get_vl(thread, ARM64_VEC_SVE); } +static inline unsigned int thread_get_sme_vl(struct thread_struct *thread) +{ + return thread_get_vl(thread, ARM64_VEC_SME); +} + +static inline unsigned int thread_get_cur_vl(struct thread_struct *thread) +{ + if (system_supports_sme() && (thread->svcr & SVCR_SM_MASK)) + return thread_get_sme_vl(thread); + else + return thread_get_sve_vl(thread); +} + unsigned int task_get_vl(const struct task_struct *task, enum vec_type type); void task_set_vl(struct task_struct *task, enum vec_type type, unsigned long vl); @@ -194,6 +211,11 @@ static inline unsigned int task_get_sve_vl(const struct task_struct *task) return task_get_vl(task, ARM64_VEC_SVE); } +static inline unsigned int task_get_sme_vl(const struct task_struct *task) +{ + return task_get_vl(task, ARM64_VEC_SME); +} + static inline void task_set_sve_vl(struct task_struct *task, unsigned long vl) { task_set_vl(task, ARM64_VEC_SVE, vl); @@ -354,9 +376,11 @@ extern void __init minsigstksz_setup(void); */ #include <asm/fpsimd.h> -/* Userspace interface for PR_SVE_{SET,GET}_VL prctl()s: */ +/* Userspace interface for PR_S[MV]E_{SET,GET}_VL prctl()s: */ #define SVE_SET_VL(arg) sve_set_current_vl(arg) #define SVE_GET_VL() sve_get_current_vl() +#define SME_SET_VL(arg) sme_set_current_vl(arg) +#define SME_GET_VL() sme_get_current_vl() /* PR_PAC_RESET_KEYS prctl */ #define PAC_RESET_KEYS(tsk, arg) ptrauth_prctl_reset_keys(tsk, arg) @@ -381,12 +405,10 @@ long get_tagged_addr_ctrl(struct task_struct *task); * of header definitions for the use of task_stack_page. */ -#define current_top_of_stack() \ -({ \ - struct stack_info _info; \ - BUG_ON(!on_accessible_stack(current, current_stack_pointer, 1, &_info)); \ - _info.high; \ -}) +/* + * The top of the current task's task stack + */ +#define current_top_of_stack() ((unsigned long)current->stack + THREAD_SIZE) #define on_thread_stack() (on_task_stack(current, current_stack_pointer, 1, NULL)) #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index e77cdef9ca29..aec9315bf156 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -31,38 +31,6 @@ struct stack_info { enum stack_type type; }; -/* - * A snapshot of a frame record or fp/lr register values, along with some - * accounting information necessary for robust unwinding. - * - * @fp: The fp value in the frame record (or the real fp) - * @pc: The lr value in the frame record (or the real lr) - * - * @stacks_done: Stacks which have been entirely unwound, for which it is no - * longer valid to unwind to. - * - * @prev_fp: The fp that pointed to this frame record, or a synthetic value - * of 0. This is used to ensure that within a stack, each - * subsequent frame record is at an increasing address. - * @prev_type: The type of stack this frame record was on, or a synthetic - * value of STACK_TYPE_UNKNOWN. This is used to detect a - * transition from one stack to another. - * - * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance - * associated with the most recently encountered replacement lr - * value. - */ -struct stackframe { - unsigned long fp; - unsigned long pc; - DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); - unsigned long prev_fp; - enum stack_type prev_type; -#ifdef CONFIG_KRETPROBES - struct llist_node *kr_cur; -#endif -}; - extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index fbf5f8bb9055..55f998c3dc28 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -115,9 +115,21 @@ #define SYS_DC_CISW sys_insn(1, 0, 7, 14, 2) /* + * Automatically generated definitions for system registers, the + * manual encodings below are in the process of being converted to + * come from here. The header relies on the definition of sys_reg() + * earlier in this file. + */ +#include "asm/sysreg-defs.h" + +/* * System registers, organised loosely by encoding but grouped together * where the architected name contains an index. e.g. ID_MMFR<n>_EL1. */ +#define SYS_SVCR_SMSTOP_SM_EL0 sys_reg(0, 3, 4, 2, 3) +#define SYS_SVCR_SMSTART_SM_EL0 sys_reg(0, 3, 4, 3, 3) +#define SYS_SVCR_SMSTOP_SMZA_EL0 sys_reg(0, 3, 4, 6, 3) + #define SYS_OSDTRRX_EL1 sys_reg(2, 0, 0, 0, 2) #define SYS_MDCCINT_EL1 sys_reg(2, 0, 0, 2, 0) #define SYS_MDSCR_EL1 sys_reg(2, 0, 0, 2, 2) @@ -181,6 +193,7 @@ #define SYS_ID_AA64PFR0_EL1 sys_reg(3, 0, 0, 4, 0) #define SYS_ID_AA64PFR1_EL1 sys_reg(3, 0, 0, 4, 1) #define SYS_ID_AA64ZFR0_EL1 sys_reg(3, 0, 0, 4, 4) +#define SYS_ID_AA64SMFR0_EL1 sys_reg(3, 0, 0, 4, 5) #define SYS_ID_AA64DFR0_EL1 sys_reg(3, 0, 0, 5, 0) #define SYS_ID_AA64DFR1_EL1 sys_reg(3, 0, 0, 5, 1) @@ -188,7 +201,6 @@ #define SYS_ID_AA64AFR0_EL1 sys_reg(3, 0, 0, 5, 4) #define SYS_ID_AA64AFR1_EL1 sys_reg(3, 0, 0, 5, 5) -#define SYS_ID_AA64ISAR0_EL1 sys_reg(3, 0, 0, 6, 0) #define SYS_ID_AA64ISAR1_EL1 sys_reg(3, 0, 0, 6, 1) #define SYS_ID_AA64ISAR2_EL1 sys_reg(3, 0, 0, 6, 2) @@ -196,17 +208,12 @@ #define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1) #define SYS_ID_AA64MMFR2_EL1 sys_reg(3, 0, 0, 7, 2) -#define SYS_SCTLR_EL1 sys_reg(3, 0, 1, 0, 0) #define SYS_ACTLR_EL1 sys_reg(3, 0, 1, 0, 1) -#define SYS_CPACR_EL1 sys_reg(3, 0, 1, 0, 2) #define SYS_RGSR_EL1 sys_reg(3, 0, 1, 0, 5) #define SYS_GCR_EL1 sys_reg(3, 0, 1, 0, 6) -#define SYS_ZCR_EL1 sys_reg(3, 0, 1, 2, 0) #define SYS_TRFCR_EL1 sys_reg(3, 0, 1, 2, 1) -#define SYS_TTBR0_EL1 sys_reg(3, 0, 2, 0, 0) -#define SYS_TTBR1_EL1 sys_reg(3, 0, 2, 0, 1) #define SYS_TCR_EL1 sys_reg(3, 0, 2, 0, 2) #define SYS_APIAKEYLO_EL1 sys_reg(3, 0, 2, 1, 0) @@ -242,7 +249,6 @@ #define SYS_TFSR_EL1 sys_reg(3, 0, 5, 6, 0) #define SYS_TFSRE0_EL1 sys_reg(3, 0, 5, 6, 1) -#define SYS_FAR_EL1 sys_reg(3, 0, 6, 0, 0) #define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0) #define SYS_PAR_EL1_F BIT(0) @@ -441,7 +447,6 @@ #define SYS_ICC_IGRPEN0_EL1 sys_reg(3, 0, 12, 12, 6) #define SYS_ICC_IGRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) -#define SYS_CONTEXTIDR_EL1 sys_reg(3, 0, 13, 0, 1) #define SYS_TPIDR_EL1 sys_reg(3, 0, 13, 0, 4) #define SYS_SCXTNUM_EL1 sys_reg(3, 0, 13, 0, 7) @@ -449,11 +454,12 @@ #define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0) #define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0) -#define SYS_CLIDR_EL1 sys_reg(3, 1, 0, 0, 1) #define SYS_GMID_EL1 sys_reg(3, 1, 0, 0, 4) #define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7) -#define SYS_CSSELR_EL1 sys_reg(3, 2, 0, 0, 0) +#define SMIDR_EL1_IMPLEMENTER_SHIFT 24 +#define SMIDR_EL1_SMPS_SHIFT 15 +#define SMIDR_EL1_AFFINITY_SHIFT 0 #define SYS_CTR_EL0 sys_reg(3, 3, 0, 0, 1) #define SYS_DCZID_EL0 sys_reg(3, 3, 0, 0, 7) @@ -477,6 +483,7 @@ #define SYS_TPIDR_EL0 sys_reg(3, 3, 13, 0, 2) #define SYS_TPIDRRO_EL0 sys_reg(3, 3, 13, 0, 3) +#define SYS_TPIDR2_EL0 sys_reg(3, 3, 13, 0, 5) #define SYS_SCXTNUM_EL0 sys_reg(3, 3, 13, 0, 7) @@ -544,9 +551,8 @@ #define SYS_HFGRTR_EL2 sys_reg(3, 4, 1, 1, 4) #define SYS_HFGWTR_EL2 sys_reg(3, 4, 1, 1, 5) #define SYS_HFGITR_EL2 sys_reg(3, 4, 1, 1, 6) -#define SYS_ZCR_EL2 sys_reg(3, 4, 1, 2, 0) #define SYS_TRFCR_EL2 sys_reg(3, 4, 1, 2, 1) -#define SYS_DACR32_EL2 sys_reg(3, 4, 3, 0, 0) +#define SYS_HCRX_EL2 sys_reg(3, 4, 1, 2, 2) #define SYS_HDFGRTR_EL2 sys_reg(3, 4, 3, 1, 4) #define SYS_HDFGWTR_EL2 sys_reg(3, 4, 3, 1, 5) #define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6) @@ -557,7 +563,6 @@ #define SYS_VSESR_EL2 sys_reg(3, 4, 5, 2, 3) #define SYS_FPEXC32_EL2 sys_reg(3, 4, 5, 3, 0) #define SYS_TFSR_EL2 sys_reg(3, 4, 5, 6, 0) -#define SYS_FAR_EL2 sys_reg(3, 4, 6, 0, 0) #define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1) #define __SYS__AP0Rx_EL2(x) sys_reg(3, 4, 12, 8, x) @@ -603,8 +608,6 @@ /* VHE encodings for architectural EL0/1 system registers */ #define SYS_SCTLR_EL12 sys_reg(3, 5, 1, 0, 0) -#define SYS_CPACR_EL12 sys_reg(3, 5, 1, 0, 2) -#define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0) #define SYS_TTBR0_EL12 sys_reg(3, 5, 2, 0, 0) #define SYS_TTBR1_EL12 sys_reg(3, 5, 2, 0, 1) #define SYS_TCR_EL12 sys_reg(3, 5, 2, 0, 2) @@ -614,11 +617,9 @@ #define SYS_AFSR1_EL12 sys_reg(3, 5, 5, 1, 1) #define SYS_ESR_EL12 sys_reg(3, 5, 5, 2, 0) #define SYS_TFSR_EL12 sys_reg(3, 5, 5, 6, 0) -#define SYS_FAR_EL12 sys_reg(3, 5, 6, 0, 0) #define SYS_MAIR_EL12 sys_reg(3, 5, 10, 2, 0) #define SYS_AMAIR_EL12 sys_reg(3, 5, 10, 3, 0) #define SYS_VBAR_EL12 sys_reg(3, 5, 12, 0, 0) -#define SYS_CONTEXTIDR_EL12 sys_reg(3, 5, 13, 0, 1) #define SYS_CNTKCTL_EL12 sys_reg(3, 5, 14, 1, 0) #define SYS_CNTP_TVAL_EL02 sys_reg(3, 5, 14, 2, 0) #define SYS_CNTP_CTL_EL02 sys_reg(3, 5, 14, 2, 1) @@ -628,31 +629,30 @@ #define SYS_CNTV_CVAL_EL02 sys_reg(3, 5, 14, 3, 2) /* Common SCTLR_ELx flags. */ +#define SCTLR_ELx_ENTP2 (BIT(60)) #define SCTLR_ELx_DSSBS (BIT(44)) #define SCTLR_ELx_ATA (BIT(43)) -#define SCTLR_ELx_TCF_SHIFT 40 -#define SCTLR_ELx_TCF_NONE (UL(0x0) << SCTLR_ELx_TCF_SHIFT) -#define SCTLR_ELx_TCF_SYNC (UL(0x1) << SCTLR_ELx_TCF_SHIFT) -#define SCTLR_ELx_TCF_ASYNC (UL(0x2) << SCTLR_ELx_TCF_SHIFT) -#define SCTLR_ELx_TCF_ASYMM (UL(0x3) << SCTLR_ELx_TCF_SHIFT) -#define SCTLR_ELx_TCF_MASK (UL(0x3) << SCTLR_ELx_TCF_SHIFT) - #define SCTLR_ELx_ENIA_SHIFT 31 -#define SCTLR_ELx_ITFSB (BIT(37)) -#define SCTLR_ELx_ENIA (BIT(SCTLR_ELx_ENIA_SHIFT)) -#define SCTLR_ELx_ENIB (BIT(30)) -#define SCTLR_ELx_ENDA (BIT(27)) -#define SCTLR_ELx_EE (BIT(25)) -#define SCTLR_ELx_IESB (BIT(21)) -#define SCTLR_ELx_WXN (BIT(19)) -#define SCTLR_ELx_ENDB (BIT(13)) -#define SCTLR_ELx_I (BIT(12)) -#define SCTLR_ELx_SA (BIT(3)) -#define SCTLR_ELx_C (BIT(2)) -#define SCTLR_ELx_A (BIT(1)) -#define SCTLR_ELx_M (BIT(0)) +#define SCTLR_ELx_ITFSB (BIT(37)) +#define SCTLR_ELx_ENIA (BIT(SCTLR_ELx_ENIA_SHIFT)) +#define SCTLR_ELx_ENIB (BIT(30)) +#define SCTLR_ELx_LSMAOE (BIT(29)) +#define SCTLR_ELx_nTLSMD (BIT(28)) +#define SCTLR_ELx_ENDA (BIT(27)) +#define SCTLR_ELx_EE (BIT(25)) +#define SCTLR_ELx_EIS (BIT(22)) +#define SCTLR_ELx_IESB (BIT(21)) +#define SCTLR_ELx_TSCXT (BIT(20)) +#define SCTLR_ELx_WXN (BIT(19)) +#define SCTLR_ELx_ENDB (BIT(13)) +#define SCTLR_ELx_I (BIT(12)) +#define SCTLR_ELx_EOS (BIT(11)) +#define SCTLR_ELx_SA (BIT(3)) +#define SCTLR_ELx_C (BIT(2)) +#define SCTLR_ELx_A (BIT(1)) +#define SCTLR_ELx_M (BIT(0)) /* SCTLR_EL2 specific flags. */ #define SCTLR_EL2_RES1 ((BIT(4)) | (BIT(5)) | (BIT(11)) | (BIT(16)) | \ @@ -674,34 +674,6 @@ (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) /* SCTLR_EL1 specific flags. */ -#define SCTLR_EL1_EPAN (BIT(57)) -#define SCTLR_EL1_ATA0 (BIT(42)) - -#define SCTLR_EL1_TCF0_SHIFT 38 -#define SCTLR_EL1_TCF0_NONE (UL(0x0) << SCTLR_EL1_TCF0_SHIFT) -#define SCTLR_EL1_TCF0_SYNC (UL(0x1) << SCTLR_EL1_TCF0_SHIFT) -#define SCTLR_EL1_TCF0_ASYNC (UL(0x2) << SCTLR_EL1_TCF0_SHIFT) -#define SCTLR_EL1_TCF0_ASYMM (UL(0x3) << SCTLR_EL1_TCF0_SHIFT) -#define SCTLR_EL1_TCF0_MASK (UL(0x3) << SCTLR_EL1_TCF0_SHIFT) - -#define SCTLR_EL1_BT1 (BIT(36)) -#define SCTLR_EL1_BT0 (BIT(35)) -#define SCTLR_EL1_UCI (BIT(26)) -#define SCTLR_EL1_E0E (BIT(24)) -#define SCTLR_EL1_SPAN (BIT(23)) -#define SCTLR_EL1_NTWE (BIT(18)) -#define SCTLR_EL1_NTWI (BIT(16)) -#define SCTLR_EL1_UCT (BIT(15)) -#define SCTLR_EL1_DZE (BIT(14)) -#define SCTLR_EL1_UMA (BIT(9)) -#define SCTLR_EL1_SED (BIT(8)) -#define SCTLR_EL1_ITD (BIT(7)) -#define SCTLR_EL1_CP15BEN (BIT(5)) -#define SCTLR_EL1_SA0 (BIT(4)) - -#define SCTLR_EL1_RES1 ((BIT(11)) | (BIT(20)) | (BIT(22)) | (BIT(28)) | \ - (BIT(29))) - #ifdef CONFIG_CPU_BIG_ENDIAN #define ENDIAN_SET_EL1 (SCTLR_EL1_E0E | SCTLR_ELx_EE) #else @@ -709,13 +681,17 @@ #endif #define INIT_SCTLR_EL1_MMU_OFF \ - (ENDIAN_SET_EL1 | SCTLR_EL1_RES1) + (ENDIAN_SET_EL1 | SCTLR_EL1_LSMAOE | SCTLR_EL1_nTLSMD | \ + SCTLR_EL1_EIS | SCTLR_EL1_TSCXT | SCTLR_EL1_EOS) #define INIT_SCTLR_EL1_MMU_ON \ - (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_EL1_SA0 | \ - SCTLR_EL1_SED | SCTLR_ELx_I | SCTLR_EL1_DZE | SCTLR_EL1_UCT | \ - SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN | SCTLR_ELx_ITFSB | \ - ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_EPAN | SCTLR_EL1_RES1) + (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | \ + SCTLR_EL1_SA0 | SCTLR_EL1_SED | SCTLR_ELx_I | \ + SCTLR_EL1_DZE | SCTLR_EL1_UCT | SCTLR_EL1_nTWE | \ + SCTLR_ELx_IESB | SCTLR_EL1_SPAN | SCTLR_ELx_ITFSB | \ + ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_EPAN | \ + SCTLR_EL1_LSMAOE | SCTLR_EL1_nTLSMD | SCTLR_EL1_EIS | \ + SCTLR_EL1_TSCXT | SCTLR_EL1_EOS) /* MAIR_ELx memory attributes (used by Linux) */ #define MAIR_ATTR_DEVICE_nGnRnE UL(0x00) @@ -728,25 +704,6 @@ /* Position the attr at the correct index */ #define MAIR_ATTRIDX(attr, idx) ((attr) << ((idx) * 8)) -/* id_aa64isar0 */ -#define ID_AA64ISAR0_RNDR_SHIFT 60 -#define ID_AA64ISAR0_TLB_SHIFT 56 -#define ID_AA64ISAR0_TS_SHIFT 52 -#define ID_AA64ISAR0_FHM_SHIFT 48 -#define ID_AA64ISAR0_DP_SHIFT 44 -#define ID_AA64ISAR0_SM4_SHIFT 40 -#define ID_AA64ISAR0_SM3_SHIFT 36 -#define ID_AA64ISAR0_SHA3_SHIFT 32 -#define ID_AA64ISAR0_RDM_SHIFT 28 -#define ID_AA64ISAR0_ATOMICS_SHIFT 20 -#define ID_AA64ISAR0_CRC32_SHIFT 16 -#define ID_AA64ISAR0_SHA2_SHIFT 12 -#define ID_AA64ISAR0_SHA1_SHIFT 8 -#define ID_AA64ISAR0_AES_SHIFT 4 - -#define ID_AA64ISAR0_TLB_RANGE_NI 0x0 -#define ID_AA64ISAR0_TLB_RANGE 0x2 - /* id_aa64isar1 */ #define ID_AA64ISAR1_I8MM_SHIFT 52 #define ID_AA64ISAR1_DGH_SHIFT 48 @@ -836,6 +793,7 @@ #define ID_AA64PFR0_ELx_32BIT_64BIT 0x2 /* id_aa64pfr1 */ +#define ID_AA64PFR1_SME_SHIFT 24 #define ID_AA64PFR1_MPAMFRAC_SHIFT 16 #define ID_AA64PFR1_RASFRAC_SHIFT 12 #define ID_AA64PFR1_MTE_SHIFT 8 @@ -846,6 +804,7 @@ #define ID_AA64PFR1_SSBS_PSTATE_ONLY 1 #define ID_AA64PFR1_SSBS_PSTATE_INSNS 2 #define ID_AA64PFR1_BT_BTI 0x1 +#define ID_AA64PFR1_SME 1 #define ID_AA64PFR1_MTE_NI 0x0 #define ID_AA64PFR1_MTE_EL0 0x1 @@ -874,6 +833,23 @@ #define ID_AA64ZFR0_AES_PMULL 0x2 #define ID_AA64ZFR0_SVEVER_SVE2 0x1 +/* id_aa64smfr0 */ +#define ID_AA64SMFR0_FA64_SHIFT 63 +#define ID_AA64SMFR0_I16I64_SHIFT 52 +#define ID_AA64SMFR0_F64F64_SHIFT 48 +#define ID_AA64SMFR0_I8I32_SHIFT 36 +#define ID_AA64SMFR0_F16F32_SHIFT 35 +#define ID_AA64SMFR0_B16F32_SHIFT 34 +#define ID_AA64SMFR0_F32F32_SHIFT 32 + +#define ID_AA64SMFR0_FA64 0x1 +#define ID_AA64SMFR0_I16I64 0x4 +#define ID_AA64SMFR0_F64F64 0x1 +#define ID_AA64SMFR0_I8I32 0x4 +#define ID_AA64SMFR0_F16F32 0x1 +#define ID_AA64SMFR0_B16F32 0x1 +#define ID_AA64SMFR0_F32F32 0x1 + /* id_aa64mmfr0 */ #define ID_AA64MMFR0_ECV_SHIFT 60 #define ID_AA64MMFR0_FGT_SHIFT 56 @@ -926,6 +902,7 @@ /* id_aa64mmfr1 */ #define ID_AA64MMFR1_ECBHB_SHIFT 60 +#define ID_AA64MMFR1_HCX_SHIFT 40 #define ID_AA64MMFR1_AFP_SHIFT 44 #define ID_AA64MMFR1_ETS_SHIFT 36 #define ID_AA64MMFR1_TWED_SHIFT 32 @@ -1110,18 +1087,12 @@ #define DCZID_DZP_SHIFT 4 #define DCZID_BS_SHIFT 0 -/* - * The ZCR_ELx_LEN_* definitions intentionally include bits [8:4] which - * are reserved by the SVE architecture for future expansion of the LEN - * field, with compatible semantics. - */ -#define ZCR_ELx_LEN_SHIFT 0 -#define ZCR_ELx_LEN_SIZE 9 -#define ZCR_ELx_LEN_MASK 0x1ff - #define CPACR_EL1_FPEN_EL1EN (BIT(20)) /* enable EL1 access */ #define CPACR_EL1_FPEN_EL0EN (BIT(21)) /* enable EL0 access, if EL1EN set */ +#define CPACR_EL1_SMEN_EL1EN (BIT(24)) /* enable EL1 access */ +#define CPACR_EL1_SMEN_EL0EN (BIT(25)) /* enable EL0 access, if EL1EN set */ + #define CPACR_EL1_ZEN_EL1EN (BIT(16)) /* enable EL1 access */ #define CPACR_EL1_ZEN_EL0EN (BIT(17)) /* enable EL0 access, if EL1EN set */ @@ -1170,6 +1141,8 @@ #define TRFCR_ELx_ExTRE BIT(1) #define TRFCR_ELx_E0TRE BIT(0) +/* HCRX_EL2 definitions */ +#define HCRX_EL2_SMPME_MASK (1 << 5) /* GIC Hypervisor interface registers */ /* ICH_MISR_EL2 bit definitions */ @@ -1233,6 +1206,12 @@ #define ICH_VTR_TDS_SHIFT 19 #define ICH_VTR_TDS_MASK (1 << ICH_VTR_TDS_SHIFT) +/* HFG[WR]TR_EL2 bit definitions */ +#define HFGxTR_EL2_nTPIDR2_EL0_SHIFT 55 +#define HFGxTR_EL2_nTPIDR2_EL0_MASK BIT_MASK(HFGxTR_EL2_nTPIDR2_EL0_SHIFT) +#define HFGxTR_EL2_nSMPRI_EL1_SHIFT 54 +#define HFGxTR_EL2_nSMPRI_EL1_MASK BIT_MASK(HFGxTR_EL2_nSMPRI_EL1_SHIFT) + #define ARM64_FEATURE_FIELD_BITS 4 /* Create a mask for the feature bits of the specified feature. */ @@ -1345,4 +1324,10 @@ #endif +#define SYS_FIELD_PREP(reg, field, val) \ + FIELD_PREP(reg##_##field##_MASK, val) + +#define SYS_FIELD_PREP_ENUM(reg, field, val) \ + FIELD_PREP(reg##_##field##_MASK, reg##_##field##_##val) + #endif /* __ASM_SYSREG_H */ diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h index 305a7157c6a6..0eb7709422e2 100644 --- a/arch/arm64/include/asm/system_misc.h +++ b/arch/arm64/include/asm/system_misc.h @@ -23,9 +23,9 @@ void die(const char *msg, struct pt_regs *regs, int err); struct siginfo; void arm64_notify_die(const char *str, struct pt_regs *regs, int signo, int sicode, unsigned long far, - int err); + unsigned long err); -void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int, +void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned long, struct pt_regs *), int sig, int code, const char *name); diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index e1317b7c4525..848739c15de8 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -82,6 +82,8 @@ int arch_dup_task_struct(struct task_struct *dst, #define TIF_SVE_VL_INHERIT 24 /* Inherit SVE vl_onexec across exec */ #define TIF_SSBD 25 /* Wants SSB mitigation */ #define TIF_TAGGED_ADDR 26 /* Allow tagged user addresses */ +#define TIF_SME 27 /* SME in use */ +#define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index 54f32a0675df..6e5826470bea 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -24,7 +24,7 @@ struct undef_hook { void register_undef_hook(struct undef_hook *hook); void unregister_undef_hook(struct undef_hook *hook); -void force_signal_inject(int signal, int code, unsigned long address, unsigned int err); +void force_signal_inject(int signal, int code, unsigned long address, unsigned long err); void arm64_notify_segfault(unsigned long addr); void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str); void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str); @@ -57,7 +57,7 @@ static inline int in_entry_text(unsigned long ptr) * errors share the same encoding as an all-zeros encoding from a CPU that * doesn't support RAS. */ -static inline bool arm64_is_ras_serror(u32 esr) +static inline bool arm64_is_ras_serror(unsigned long esr) { WARN_ON(preemptible()); @@ -77,9 +77,9 @@ static inline bool arm64_is_ras_serror(u32 esr) * We treat them as Uncontainable. * Non-RAS SError's are reported as Uncontained/Uncategorized. */ -static inline u32 arm64_ras_serror_get_severity(u32 esr) +static inline unsigned long arm64_ras_serror_get_severity(unsigned long esr) { - u32 aet = esr & ESR_ELx_AET; + unsigned long aet = esr & ESR_ELx_AET; if (!arm64_is_ras_serror(esr)) { /* Not a RAS error, we can't interpret the ESR. */ @@ -98,6 +98,6 @@ static inline u32 arm64_ras_serror_get_severity(u32 esr) return aet; } -bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr); -void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr); +bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr); +void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr); #endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index e8dce0cc5eaa..63f9c828f1a7 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -460,4 +460,19 @@ static inline int __copy_from_user_flushcache(void *dst, const void __user *src, } #endif +#ifdef CONFIG_ARCH_HAS_SUBPAGE_FAULTS + +/* + * Return 0 on success, the number of bytes not probed otherwise. + */ +static inline size_t probe_subpage_writeable(const char __user *uaddr, + size_t size) +{ + if (!system_supports_mte()) + return 0; + return mte_probe_user_range(uaddr, size); +} + +#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 99cb5d383048..b0256cec63b5 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -79,5 +79,13 @@ #define HWCAP2_AFP (1 << 20) #define HWCAP2_RPRES (1 << 21) #define HWCAP2_MTE3 (1 << 22) +#define HWCAP2_SME (1 << 23) +#define HWCAP2_SME_I16I64 (1 << 24) +#define HWCAP2_SME_F64F64 (1 << 25) +#define HWCAP2_SME_I8I32 (1 << 26) +#define HWCAP2_SME_F16F32 (1 << 27) +#define HWCAP2_SME_B16F32 (1 << 28) +#define HWCAP2_SME_F32F32 (1 << 29) +#define HWCAP2_SME_FA64 (1 << 30) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index c1b6ddc02d2f..ab585359242d 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -139,8 +139,10 @@ struct kvm_guest_debug_arch { __u64 dbg_wvr[KVM_ARM_MAX_DBG_REGS]; }; +#define KVM_DEBUG_ARCH_HSR_HIGH_VALID (1 << 0) struct kvm_debug_exit_arch { __u32 hsr; + __u32 hsr_high; /* ESR_EL2[61:32] */ __u64 far; /* used for watchpoints */ }; diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 758ae984ff97..7fa2f7036aa7 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -109,7 +109,7 @@ struct user_hwdebug_state { } dbg_regs[16]; }; -/* SVE/FP/SIMD state (NT_ARM_SVE) */ +/* SVE/FP/SIMD state (NT_ARM_SVE & NT_ARM_SSVE) */ struct user_sve_header { __u32 size; /* total meaningful regset content in bytes */ @@ -220,6 +220,7 @@ struct user_sve_header { (SVE_PT_SVE_PREG_OFFSET(vq, __SVE_NUM_PREGS) - \ SVE_PT_SVE_PREGS_OFFSET(vq)) +/* For streaming mode SVE (SSVE) FFR must be read and written as zero */ #define SVE_PT_SVE_FFR_OFFSET(vq) \ (SVE_PT_REGS_OFFSET + __SVE_FFR_OFFSET(vq)) @@ -240,10 +241,12 @@ struct user_sve_header { - SVE_PT_SVE_OFFSET + (__SVE_VQ_BYTES - 1)) \ / __SVE_VQ_BYTES * __SVE_VQ_BYTES) -#define SVE_PT_SIZE(vq, flags) \ - (((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \ - SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, flags) \ - : SVE_PT_FPSIMD_OFFSET + SVE_PT_FPSIMD_SIZE(vq, flags)) +#define SVE_PT_SIZE(vq, flags) \ + (((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \ + SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, flags) \ + : ((((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD ? \ + SVE_PT_FPSIMD_OFFSET + SVE_PT_FPSIMD_SIZE(vq, flags) \ + : SVE_PT_REGS_OFFSET))) /* pointer authentication masks (NT_ARM_PAC_MASK) */ @@ -265,6 +268,62 @@ struct user_pac_generic_keys { __uint128_t apgakey; }; +/* ZA state (NT_ARM_ZA) */ + +struct user_za_header { + __u32 size; /* total meaningful regset content in bytes */ + __u32 max_size; /* maxmium possible size for this thread */ + __u16 vl; /* current vector length */ + __u16 max_vl; /* maximum possible vector length */ + __u16 flags; + __u16 __reserved; +}; + +/* + * Common ZA_PT_* flags: + * These must be kept in sync with prctl interface in <linux/prctl.h> + */ +#define ZA_PT_VL_INHERIT ((1 << 17) /* PR_SME_VL_INHERIT */ >> 16) +#define ZA_PT_VL_ONEXEC ((1 << 18) /* PR_SME_SET_VL_ONEXEC */ >> 16) + + +/* + * The remainder of the ZA state follows struct user_za_header. The + * total size of the ZA state (including header) depends on the + * metadata in the header: ZA_PT_SIZE(vq, flags) gives the total size + * of the state in bytes, including the header. + * + * Refer to <asm/sigcontext.h> for details of how to pass the correct + * "vq" argument to these macros. + */ + +/* Offset from the start of struct user_za_header to the register data */ +#define ZA_PT_ZA_OFFSET \ + ((sizeof(struct user_za_header) + (__SVE_VQ_BYTES - 1)) \ + / __SVE_VQ_BYTES * __SVE_VQ_BYTES) + +/* + * The payload starts at offset ZA_PT_ZA_OFFSET, and is of size + * ZA_PT_ZA_SIZE(vq, flags). + * + * The ZA array is stored as a sequence of horizontal vectors ZAV of SVL/8 + * bytes each, starting from vector 0. + * + * Additional data might be appended in the future. + * + * The ZA matrix is represented in memory in an endianness-invariant layout + * which differs from the layout used for the FPSIMD V-registers on big-endian + * systems: see sigcontext.h for more explanation. + */ + +#define ZA_PT_ZAV_OFFSET(vq, n) \ + (ZA_PT_ZA_OFFSET + ((vq * __SVE_VQ_BYTES) * n)) + +#define ZA_PT_ZA_SIZE(vq) ((vq * __SVE_VQ_BYTES) * (vq * __SVE_VQ_BYTES)) + +#define ZA_PT_SIZE(vq) \ + (ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq)) + #endif /* __ASSEMBLY__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 0c796c795dbe..4aaf31e3bf16 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -134,6 +134,17 @@ struct extra_context { struct sve_context { struct _aarch64_ctx head; __u16 vl; + __u16 flags; + __u16 __reserved[2]; +}; + +#define SVE_SIG_FLAG_SM 0x1 /* Context describes streaming mode */ + +#define ZA_MAGIC 0x54366345 + +struct za_context { + struct _aarch64_ctx head; + __u16 vl; __u16 __reserved[3]; }; @@ -186,9 +197,16 @@ struct sve_context { * sve_context.vl must equal the thread's current vector length when * doing a sigreturn. * + * On systems with support for SME the SVE register state may reflect either + * streaming or non-streaming mode. In streaming mode the streaming mode + * vector length will be used and the flag SVE_SIG_FLAG_SM will be set in + * the flags field. It is permitted to enter or leave streaming mode in + * a signal return, applications should take care to ensure that any difference + * in vector length between the two modes is handled, including any resizing + * and movement of context blocks. * - * Note: for all these macros, the "vq" argument denotes the SVE - * vector length in quadwords (i.e., units of 128 bits). + * Note: for all these macros, the "vq" argument denotes the vector length + * in quadwords (i.e., units of 128 bits). * * The correct way to obtain vq is to use sve_vq_from_vl(vl). The * result is valid if and only if sve_vl_valid(vl) is true. This is @@ -249,4 +267,37 @@ struct sve_context { #define SVE_SIG_CONTEXT_SIZE(vq) \ (SVE_SIG_REGS_OFFSET + SVE_SIG_REGS_SIZE(vq)) +/* + * If the ZA register is enabled for the thread at signal delivery then, + * za_context.head.size >= ZA_SIG_CONTEXT_SIZE(sve_vq_from_vl(za_context.vl)) + * and the register data may be accessed using the ZA_SIG_*() macros. + * + * If za_context.head.size < ZA_SIG_CONTEXT_SIZE(sve_vq_from_vl(za_context.vl)) + * then ZA was not enabled and no register data was included in which case + * ZA register was not enabled for the thread and no register data + * the ZA_SIG_*() macros should not be used except for this check. + * + * The same convention applies when returning from a signal: a caller + * will need to remove or resize the za_context block if it wants to + * enable the ZA register when it was previously non-live or vice-versa. + * This may require the caller to allocate fresh memory and/or move other + * context blocks in the signal frame. + * + * Changing the vector length during signal return is not permitted: + * za_context.vl must equal the thread's current SME vector length when + * doing a sigreturn. + */ + +#define ZA_SIG_REGS_OFFSET \ + ((sizeof(struct za_context) + (__SVE_VQ_BYTES - 1)) \ + / __SVE_VQ_BYTES * __SVE_VQ_BYTES) + +#define ZA_SIG_REGS_SIZE(vq) ((vq * __SVE_VQ_BYTES) * (vq * __SVE_VQ_BYTES)) + +#define ZA_SIG_ZAV_OFFSET(vq, n) (ZA_SIG_REGS_OFFSET + \ + (SVE_SIG_ZREG_SIZE(vq) * n)) + +#define ZA_SIG_CONTEXT_SIZE(vq) \ + (ZA_SIG_REGS_OFFSET + ZA_SIG_REGS_SIZE(vq)) + #endif /* _UAPI__ASM_SIGCONTEXT_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 986837d7ec82..fa7981d0d917 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -75,6 +75,10 @@ obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o +# Force dependency (vdso*-wrap.S includes vdso.so through incbin) +$(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so +$(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so + obj-y += probes/ head-y := head.o extra-y += $(head-y) vmlinux.lds diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 4c9b5b4b7a0b..c05cc3b6162e 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -208,6 +208,8 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { #ifdef CONFIG_ARM64_ERRATUM_1286807 { ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 0), + /* Kryo4xx Gold (rcpe to rfpe) => (r0p0 to r3p0) */ + ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe), }, #endif {}, @@ -215,7 +217,7 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { #endif #ifdef CONFIG_CAVIUM_ERRATUM_23154 -const struct midr_range cavium_erratum_23154_cpus[] = { +static const struct midr_range cavium_erratum_23154_cpus[] = { MIDR_ALL_VERSIONS(MIDR_THUNDERX), MIDR_ALL_VERSIONS(MIDR_THUNDERX_81XX), MIDR_ALL_VERSIONS(MIDR_THUNDERX_83XX), diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d72c4b4d389c..4ccddf382e5b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -191,20 +191,20 @@ static bool __system_matches_cap(unsigned int n); * sync with the documentation of the CPU feature register ABI. */ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM4_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RDM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_ATOMICS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_CRC32_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA1_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TLB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_DP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -261,6 +261,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE), @@ -293,6 +295,24 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_FA64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I16I64_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F64F64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I8I32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F16F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_B16F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F32F32_SHIFT, 1, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0), @@ -557,7 +577,13 @@ static const struct arm64_ftr_bits ftr_id_dfr1[] = { static const struct arm64_ftr_bits ftr_zcr[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, - ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_SIZE, 0), /* LEN */ + ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_WIDTH, 0), /* LEN */ + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_smcr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, + SMCR_ELx_LEN_SHIFT, SMCR_ELx_LEN_WIDTH, 0), /* LEN */ ARM64_FTR_END, }; @@ -645,6 +671,7 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, &id_aa64pfr1_override), ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0), + ARM64_FTR_REG(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -654,7 +681,6 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1, &id_aa64isar1_override), - ARM64_FTR_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, &id_aa64isar2_override), @@ -666,6 +692,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 1, CRm = 2 */ ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr), + ARM64_FTR_REG(SYS_SMCR_EL1, ftr_smcr), /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), @@ -810,7 +837,7 @@ static void __init sort_ftr_regs(void) * to sys_id for subsequent binary search in get_arm64_ftr_reg() * to work correctly. */ - BUG_ON(arm64_ftr_regs[i].sys_id < arm64_ftr_regs[i - 1].sys_id); + BUG_ON(arm64_ftr_regs[i].sys_id <= arm64_ftr_regs[i - 1].sys_id); } } @@ -960,6 +987,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); + init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0); if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) init_32bit_cpu_features(&info->aarch32); @@ -969,6 +997,12 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) vec_init_vq_map(ARM64_VEC_SVE); } + if (id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { + init_cpu_ftr_reg(SYS_SMCR_EL1, info->reg_smcr); + if (IS_ENABLED(CONFIG_ARM64_SME)) + vec_init_vq_map(ARM64_VEC_SME); + } + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); @@ -1195,6 +1229,9 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); + taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu, + info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0); + if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu, info->reg_zcr, boot->reg_zcr); @@ -1205,6 +1242,16 @@ void update_cpu_features(int cpu, vec_update_vq_map(ARM64_VEC_SVE); } + if (id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { + taint |= check_update_ftr_reg(SYS_SMCR_EL1, cpu, + info->reg_smcr, boot->reg_smcr); + + /* Probe vector lengths, unless we already gave up on SME */ + if (id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1)) && + !system_capabilities_finalized()) + vec_update_vq_map(ARM64_VEC_SME); + } + /* * The kernel uses the LDGM/STGM instructions and the number of tags * they read/write depends on the GMID_EL1.BS field. Check that the @@ -1288,6 +1335,7 @@ u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64PFR0_EL1); read_sysreg_case(SYS_ID_AA64PFR1_EL1); read_sysreg_case(SYS_ID_AA64ZFR0_EL1); + read_sysreg_case(SYS_ID_AA64SMFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR0_EL1); @@ -2013,7 +2061,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_ATOMICS_SHIFT, + .field_pos = ID_AA64ISAR0_EL1_ATOMIC_SHIFT, .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 2, @@ -2195,10 +2243,10 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_TLB_SHIFT, + .field_pos = ID_AA64ISAR0_EL1_TLB_SHIFT, .field_width = 4, .sign = FTR_UNSIGNED, - .min_field_value = ID_AA64ISAR0_TLB_RANGE, + .min_field_value = ID_AA64ISAR0_EL1_TLB_RANGE, }, #ifdef CONFIG_ARM64_HW_AFDBM { @@ -2227,7 +2275,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_CRC32_SHIFT, + .field_pos = ID_AA64ISAR0_EL1_CRC32_SHIFT, .field_width = 4, .min_field_value = 1, }, @@ -2382,7 +2430,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_RNDR_SHIFT, + .field_pos = ID_AA64ISAR0_EL1_RNDR_SHIFT, .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 1, @@ -2442,6 +2490,33 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, .min_field_value = 1, }, +#ifdef CONFIG_ARM64_SME + { + .desc = "Scalable Matrix Extension", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR1_SME_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR1_SME, + .matches = has_cpuid_feature, + .cpu_enable = sme_kernel_enable, + }, + /* FA64 should be sorted after the base SME capability */ + { + .desc = "FA64", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME_FA64, + .sys_reg = SYS_ID_AA64SMFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64SMFR0_FA64_SHIFT, + .field_width = 1, + .min_field_value = ID_AA64SMFR0_FA64, + .matches = has_cpuid_feature, + .cpu_enable = fa64_kernel_enable, + }, +#endif /* CONFIG_ARM64_SME */ {}, }; @@ -2514,22 +2589,22 @@ static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { #endif static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RNDR_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_DP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_TS_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_TS_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RNG), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, 4, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, 4, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, 4, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD), @@ -2575,6 +2650,16 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV), HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_AFP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP), HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_RPRES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES), +#ifdef CONFIG_ARM64_SME + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SME_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_SME, CAP_HWCAP, KERNEL_HWCAP_SME), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_FA64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_FA64, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I16I64_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_I16I64, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F64F64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_F64F64, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I8I32_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_I8I32, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_F16F32, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_B16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_B16F32, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F32F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_F32F32, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), +#endif /* CONFIG_ARM64_SME */ {}, }; @@ -2872,6 +2957,23 @@ static void verify_sve_features(void) /* Add checks on other ZCR bits here if necessary */ } +static void verify_sme_features(void) +{ + u64 safe_smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1); + u64 smcr = read_smcr_features(); + + unsigned int safe_len = safe_smcr & SMCR_ELx_LEN_MASK; + unsigned int len = smcr & SMCR_ELx_LEN_MASK; + + if (len < safe_len || vec_verify_vq_map(ARM64_VEC_SME)) { + pr_crit("CPU%d: SME: vector length support mismatch\n", + smp_processor_id()); + cpu_die_early(); + } + + /* Add checks on other SMCR bits here if necessary */ +} + static void verify_hyp_capabilities(void) { u64 safe_mmfr1, mmfr0, mmfr1; @@ -2924,6 +3026,9 @@ static void verify_local_cpu_capabilities(void) if (system_supports_sve()) verify_sve_features(); + if (system_supports_sme()) + verify_sme_features(); + if (is_hyp_mode_available()) verify_hyp_capabilities(); } @@ -3041,6 +3146,7 @@ void __init setup_cpu_features(void) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); sve_setup(); + sme_setup(); minsigstksz_setup(); /* Advertise that we have computed the system capabilities */ diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 330b92ea863a..8a8136a096ac 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -98,6 +98,14 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_AFP] = "afp", [KERNEL_HWCAP_RPRES] = "rpres", [KERNEL_HWCAP_MTE3] = "mte3", + [KERNEL_HWCAP_SME] = "sme", + [KERNEL_HWCAP_SME_I16I64] = "smei16i64", + [KERNEL_HWCAP_SME_F64F64] = "smef64f64", + [KERNEL_HWCAP_SME_I8I32] = "smei8i32", + [KERNEL_HWCAP_SME_F16F32] = "smef16f32", + [KERNEL_HWCAP_SME_B16F32] = "smeb16f32", + [KERNEL_HWCAP_SME_F32F32] = "smef32f32", + [KERNEL_HWCAP_SME_FA64] = "smefa64", }; #ifdef CONFIG_COMPAT @@ -401,6 +409,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); + info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1); if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) info->reg_gmid = read_cpuid(GMID_EL1); @@ -412,6 +421,10 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) id_aa64pfr0_sve(info->reg_id_aa64pfr0)) info->reg_zcr = read_zcr_features(); + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(info->reg_id_aa64pfr1)) + info->reg_smcr = read_smcr_features(); + cpuinfo_detect_icache_policy(info); } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 4f3661eeb7ec..bf9fe71589bc 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -202,7 +202,7 @@ void unregister_kernel_step_hook(struct step_hook *hook) * So we call all the registered handlers, until the right handler is * found which returns zero. */ -static int call_step_hook(struct pt_regs *regs, unsigned int esr) +static int call_step_hook(struct pt_regs *regs, unsigned long esr) { struct step_hook *hook; struct list_head *list; @@ -238,7 +238,7 @@ static void send_user_sigtrap(int si_code) "User debug trap"); } -static int single_step_handler(unsigned long unused, unsigned int esr, +static int single_step_handler(unsigned long unused, unsigned long esr, struct pt_regs *regs) { bool handler_found = false; @@ -299,11 +299,11 @@ void unregister_kernel_break_hook(struct break_hook *hook) unregister_debug_hook(&hook->node); } -static int call_break_hook(struct pt_regs *regs, unsigned int esr) +static int call_break_hook(struct pt_regs *regs, unsigned long esr) { struct break_hook *hook; struct list_head *list; - int (*fn)(struct pt_regs *regs, unsigned int esr) = NULL; + int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL; list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; @@ -312,7 +312,7 @@ static int call_break_hook(struct pt_regs *regs, unsigned int esr) * entirely not preemptible, and we can use rcu list safely here. */ list_for_each_entry_rcu(hook, list, node) { - unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; + unsigned long comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; if ((comment & ~hook->mask) == hook->imm) fn = hook->fn; @@ -322,7 +322,7 @@ static int call_break_hook(struct pt_regs *regs, unsigned int esr) } NOKPROBE_SYMBOL(call_break_hook); -static int brk_handler(unsigned long unused, unsigned int esr, +static int brk_handler(unsigned long unused, unsigned long esr, struct pt_regs *regs) { if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 878c65aa7206..56cefd33eb8e 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -75,7 +75,7 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) if (interrupts_enabled(regs)) { if (regs->exit_rcu) { trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); rcu_irq_exit(); lockdep_hardirqs_on(CALLER_ADDR0); return; @@ -121,7 +121,7 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) static __always_inline void __exit_to_user_mode(void) { trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); user_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); } @@ -179,7 +179,7 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs) ftrace_nmi_exit(); if (restore) { trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); } rcu_nmi_exit(); @@ -215,7 +215,7 @@ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) if (restore) { trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); } rcu_nmi_exit(); @@ -282,13 +282,13 @@ extern void (*handle_arch_irq)(struct pt_regs *); extern void (*handle_arch_fiq)(struct pt_regs *); static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, - unsigned int esr) + unsigned long esr) { arm64_enter_nmi(regs); console_verbose(); - pr_crit("Unhandled %s exception on CPU%d, ESR 0x%08x -- %s\n", + pr_crit("Unhandled %s exception on CPU%d, ESR 0x%016lx -- %s\n", vector, smp_processor_id(), esr, esr_get_class_string(esr)); @@ -537,6 +537,14 @@ static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) exit_to_user_mode(regs); } +static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sme_acc(esr, regs); + exit_to_user_mode(regs); +} + static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) { enter_from_user_mode(regs); @@ -645,6 +653,9 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_SVE: el0_sve_acc(regs, esr); break; + case ESR_ELx_EC_SME: + el0_sme_acc(regs, esr); + break; case ESR_ELx_EC_FP_EXC64: el0_fpsimd_exc(regs, esr); break; @@ -818,7 +829,7 @@ UNHANDLED(el0t, 32, error) #ifdef CONFIG_VMAP_STACK asmlinkage void noinstr handle_bad_stack(struct pt_regs *regs) { - unsigned int esr = read_sysreg(esr_el1); + unsigned long esr = read_sysreg(esr_el1); unsigned long far = read_sysreg(far_el1); arm64_enter_nmi(regs); diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index dc242e269f9a..229436f33df5 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -86,3 +86,39 @@ SYM_FUNC_START(sve_flush_live) SYM_FUNC_END(sve_flush_live) #endif /* CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +SYM_FUNC_START(sme_get_vl) + _sme_rdsvl 0, 1 + ret +SYM_FUNC_END(sme_get_vl) + +SYM_FUNC_START(sme_set_vq) + sme_load_vq x0, x1, x2 + ret +SYM_FUNC_END(sme_set_vq) + +/* + * Save the SME state + * + * x0 - pointer to buffer for state + */ +SYM_FUNC_START(za_save_state) + _sme_rdsvl 1, 1 // x1 = VL/8 + sme_save_za 0, x1, 12 + ret +SYM_FUNC_END(za_save_state) + +/* + * Load the SME state + * + * x0 - pointer to buffer for state + */ +SYM_FUNC_START(za_load_state) + _sme_rdsvl 1, 1 // x1 = VL/8 + sme_load_za 0, x1, 12 + ret +SYM_FUNC_END(za_load_state) + +#endif /* CONFIG_ARM64_SME */ diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index e535480a4069..d42a205ef625 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -97,12 +97,6 @@ SYM_CODE_START(ftrace_common) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) bl ftrace_stub -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) // ftrace_graph_caller(); - nop // If enabled, this will be replaced - // "b ftrace_graph_caller" -#endif - /* * At the callsite x0-x8 and x19-x30 were live. Any C code will have preserved * x19-x29 per the AAPCS, and we created frame records upon entry, so we need @@ -127,17 +121,6 @@ ftrace_common_return: ret x9 SYM_CODE_END(ftrace_common) -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_CODE_START(ftrace_graph_caller) - ldr x0, [sp, #S_PC] - sub x0, x0, #AARCH64_INSN_SIZE // ip (callsite's BL insn) - add x1, sp, #S_LR // parent_ip (callsite's LR) - ldr x2, [sp, #PT_REGS_SIZE] // parent fp (callsite's FP) - bl prepare_ftrace_return - b ftrace_common_return -SYM_CODE_END(ftrace_graph_caller) -#endif - #else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ /* diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ede028dee81b..5b82b9292400 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -596,7 +596,7 @@ SYM_CODE_START_LOCAL(ret_to_user) ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step enable_step_tsk x19, x2 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK - bl stackleak_erase + bl stackleak_erase_on_task_stack #endif kernel_exit 0 SYM_CODE_END(ret_to_user) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 47af76e53221..819979398127 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -121,7 +121,10 @@ struct fpsimd_last_state_struct { struct user_fpsimd_state *st; void *sve_state; + void *za_state; + u64 *svcr; unsigned int sve_vl; + unsigned int sme_vl; }; static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); @@ -136,6 +139,12 @@ __ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { .max_virtualisable_vl = SVE_VL_MIN, }, #endif +#ifdef CONFIG_ARM64_SME + [ARM64_VEC_SME] = { + .type = ARM64_VEC_SME, + .name = "SME", + }, +#endif }; static unsigned int vec_vl_inherit_flag(enum vec_type type) @@ -143,6 +152,8 @@ static unsigned int vec_vl_inherit_flag(enum vec_type type) switch (type) { case ARM64_VEC_SVE: return TIF_SVE_VL_INHERIT; + case ARM64_VEC_SME: + return TIF_SME_VL_INHERIT; default: WARN_ON_ONCE(1); return 0; @@ -186,6 +197,26 @@ extern void __percpu *efi_sve_state; #endif /* ! CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int get_sme_default_vl(void) +{ + return get_default_vl(ARM64_VEC_SME); +} + +static void set_sme_default_vl(int val) +{ + set_default_vl(ARM64_VEC_SME, val); +} + +static void sme_free(struct task_struct *); + +#else + +static inline void sme_free(struct task_struct *t) { } + +#endif + DEFINE_PER_CPU(bool, fpsimd_context_busy); EXPORT_PER_CPU_SYMBOL(fpsimd_context_busy); @@ -206,10 +237,19 @@ static void __get_cpu_fpsimd_context(void) * * The double-underscore version must only be called if you know the task * can't be preempted. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. */ static void get_cpu_fpsimd_context(void) { - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); __get_cpu_fpsimd_context(); } @@ -230,7 +270,10 @@ static void __put_cpu_fpsimd_context(void) static void put_cpu_fpsimd_context(void) { __put_cpu_fpsimd_context(); - local_bh_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } static bool have_cpu_fpsimd_context(void) @@ -238,23 +281,6 @@ static bool have_cpu_fpsimd_context(void) return !preemptible() && __this_cpu_read(fpsimd_context_busy); } -/* - * Call __sve_free() directly only if you know task can't be scheduled - * or preempted. - */ -static void __sve_free(struct task_struct *task) -{ - kfree(task->thread.sve_state); - task->thread.sve_state = NULL; -} - -static void sve_free(struct task_struct *task) -{ - WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); - - __sve_free(task); -} - unsigned int task_get_vl(const struct task_struct *task, enum vec_type type) { return task->thread.vl[type]; @@ -279,16 +305,27 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, } /* + * TIF_SME controls whether a task can use SME without trapping while + * in userspace, when TIF_SME is set then we must have storage + * alocated in sve_state and za_state to store the contents of both ZA + * and the SVE registers for both streaming and non-streaming modes. + * + * If both SVCR.ZA and SVCR.SM are disabled then at any point we + * may disable TIF_SME and reenable traps. + */ + + +/* * TIF_SVE controls whether a task can use SVE without trapping while - * in userspace, and also the way a task's FPSIMD/SVE state is stored - * in thread_struct. + * in userspace, and also (together with TIF_SME) the way a task's + * FPSIMD/SVE state is stored in thread_struct. * * The kernel uses this flag to track whether a user task is actively * using SVE, and therefore whether full SVE register state needs to * be tracked. If not, the cheaper FPSIMD context handling code can * be used instead of the more costly SVE equivalents. * - * * TIF_SVE set: + * * TIF_SVE or SVCR.SM set: * * The task can execute SVE instructions while in userspace without * trapping to the kernel. @@ -296,7 +333,8 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, * When stored, Z0-Z31 (incorporating Vn in bits[127:0] or the * corresponding Zn), P0-P15 and FFR are encoded in in * task->thread.sve_state, formatted appropriately for vector - * length task->thread.sve_vl. + * length task->thread.sve_vl or, if SVCR.SM is set, + * task->thread.sme_vl. * * task->thread.sve_state must point to a valid buffer at least * sve_state_size(task) bytes in size. @@ -334,16 +372,44 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type, */ static void task_fpsimd_load(void) { + bool restore_sve_regs = false; + bool restore_ffr; + WARN_ON(!system_supports_fpsimd()); WARN_ON(!have_cpu_fpsimd_context()); + /* Check if we should restore SVE first */ if (IS_ENABLED(CONFIG_ARM64_SVE) && test_thread_flag(TIF_SVE)) { sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); + restore_sve_regs = true; + restore_ffr = true; + } + + /* Restore SME, override SVE register configuration if needed */ + if (system_supports_sme()) { + unsigned long sme_vl = task_get_sme_vl(current); + + /* Ensure VL is set up for restoring data */ + if (test_thread_flag(TIF_SME)) + sme_set_vq(sve_vq_from_vl(sme_vl) - 1); + + write_sysreg_s(current->thread.svcr, SYS_SVCR); + + if (thread_za_enabled(¤t->thread)) + za_load_state(current->thread.za_state); + + if (thread_sm_enabled(¤t->thread)) { + restore_sve_regs = true; + restore_ffr = system_supports_fa64(); + } + } + + if (restore_sve_regs) sve_load_state(sve_pffr(¤t->thread), - ¤t->thread.uw.fpsimd_state.fpsr, true); - } else { + ¤t->thread.uw.fpsimd_state.fpsr, + restore_ffr); + else fpsimd_load_state(¤t->thread.uw.fpsimd_state); - } } /* @@ -361,6 +427,9 @@ static void fpsimd_save(void) struct fpsimd_last_state_struct const *last = this_cpu_ptr(&fpsimd_last_state); /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ + bool save_sve_regs = false; + bool save_ffr; + unsigned int vl; WARN_ON(!system_supports_fpsimd()); WARN_ON(!have_cpu_fpsimd_context()); @@ -368,9 +437,32 @@ static void fpsimd_save(void) if (test_thread_flag(TIF_FOREIGN_FPSTATE)) return; - if (IS_ENABLED(CONFIG_ARM64_SVE) && - test_thread_flag(TIF_SVE)) { - if (WARN_ON(sve_get_vl() != last->sve_vl)) { + if (test_thread_flag(TIF_SVE)) { + save_sve_regs = true; + save_ffr = true; + vl = last->sve_vl; + } + + if (system_supports_sme()) { + u64 *svcr = last->svcr; + *svcr = read_sysreg_s(SYS_SVCR); + + *svcr = read_sysreg_s(SYS_SVCR); + + if (*svcr & SVCR_ZA_MASK) + za_save_state(last->za_state); + + /* If we are in streaming mode override regular SVE. */ + if (*svcr & SVCR_SM_MASK) { + save_sve_regs = true; + save_ffr = system_supports_fa64(); + vl = last->sme_vl; + } + } + + if (IS_ENABLED(CONFIG_ARM64_SVE) && save_sve_regs) { + /* Get the configured VL from RDVL, will account for SM */ + if (WARN_ON(sve_get_vl() != vl)) { /* * Can't save the user regs, so current would * re-enter user with corrupt state. @@ -381,8 +473,8 @@ static void fpsimd_save(void) } sve_save_state((char *)last->sve_state + - sve_ffr_offset(last->sve_vl), - &last->st->fpsr, true); + sve_ffr_offset(vl), + &last->st->fpsr, save_ffr); } else { fpsimd_save_state(last->st); } @@ -409,6 +501,8 @@ static unsigned int find_supported_vector_length(enum vec_type type, if (vl > max_vl) vl = max_vl; + if (vl < info->min_vl) + vl = info->min_vl; bit = find_next_bit(info->vq_map, SVE_VQ_MAX, __vq_to_bit(sve_vq_from_vl(vl))); @@ -467,6 +561,30 @@ static int __init sve_sysctl_init(void) static int __init sve_sysctl_init(void) { return 0; } #endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ +#if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) +static struct ctl_table sme_default_vl_table[] = { + { + .procname = "sme_default_vector_length", + .mode = 0644, + .proc_handler = vec_proc_do_default_vl, + .extra1 = &vl_info[ARM64_VEC_SME], + }, + { } +}; + +static int __init sme_sysctl_init(void) +{ + if (system_supports_sme()) + if (!register_sysctl("abi", sme_default_vl_table)) + return -EINVAL; + + return 0; +} + +#else /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */ +static int __init sme_sysctl_init(void) { return 0; } +#endif /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */ + #define ZREG(sve_state, vq, n) ((char *)(sve_state) + \ (SVE_SIG_ZREG_OFFSET(vq, n) - SVE_SIG_REGS_OFFSET)) @@ -520,7 +638,7 @@ static void fpsimd_to_sve(struct task_struct *task) if (!system_supports_sve()) return; - vq = sve_vq_from_vl(task_get_sve_vl(task)); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); __fpsimd_to_sve(sst, fst, vq); } @@ -537,7 +655,7 @@ static void fpsimd_to_sve(struct task_struct *task) */ static void sve_to_fpsimd(struct task_struct *task) { - unsigned int vq; + unsigned int vq, vl; void const *sst = task->thread.sve_state; struct user_fpsimd_state *fst = &task->thread.uw.fpsimd_state; unsigned int i; @@ -546,7 +664,8 @@ static void sve_to_fpsimd(struct task_struct *task) if (!system_supports_sve()) return; - vq = sve_vq_from_vl(task_get_sve_vl(task)); + vl = thread_get_cur_vl(&task->thread); + vq = sve_vq_from_vl(vl); for (i = 0; i < SVE_NUM_ZREGS; ++i) { p = (__uint128_t const *)ZREG(sst, vq, i); fst->vregs[i] = arm64_le128_to_cpu(*p); @@ -554,14 +673,37 @@ static void sve_to_fpsimd(struct task_struct *task) } #ifdef CONFIG_ARM64_SVE +/* + * Call __sve_free() directly only if you know task can't be scheduled + * or preempted. + */ +static void __sve_free(struct task_struct *task) +{ + kfree(task->thread.sve_state); + task->thread.sve_state = NULL; +} + +static void sve_free(struct task_struct *task) +{ + WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + + __sve_free(task); +} /* * Return how many bytes of memory are required to store the full SVE * state for task, given task's currently configured vector length. */ -static size_t sve_state_size(struct task_struct const *task) +size_t sve_state_size(struct task_struct const *task) { - return SVE_SIG_REGS_SIZE(sve_vq_from_vl(task_get_sve_vl(task))); + unsigned int vl = 0; + + if (system_supports_sve()) + vl = task_get_sve_vl(task); + if (system_supports_sme()) + vl = max(vl, task_get_sme_vl(task)); + + return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); } /* @@ -588,6 +730,19 @@ void sve_alloc(struct task_struct *task) /* + * Force the FPSIMD state shared with SVE to be updated in the SVE state + * even if the SVE state is the current active state. + * + * This should only be called by ptrace. task must be non-runnable. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + */ +void fpsimd_force_sync_to_sve(struct task_struct *task) +{ + fpsimd_to_sve(task); +} + +/* * Ensure that task->thread.sve_state is up to date with respect to * the user task, irrespective of when SVE is in use or not. * @@ -597,7 +752,8 @@ void sve_alloc(struct task_struct *task) */ void fpsimd_sync_to_sve(struct task_struct *task) { - if (!test_tsk_thread_flag(task, TIF_SVE)) + if (!test_tsk_thread_flag(task, TIF_SVE) && + !thread_sm_enabled(&task->thread)) fpsimd_to_sve(task); } @@ -611,7 +767,8 @@ void fpsimd_sync_to_sve(struct task_struct *task) */ void sve_sync_to_fpsimd(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_SVE)) + if (test_tsk_thread_flag(task, TIF_SVE) || + thread_sm_enabled(&task->thread)) sve_to_fpsimd(task); } @@ -636,7 +793,7 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) if (!test_tsk_thread_flag(task, TIF_SVE)) return; - vq = sve_vq_from_vl(task_get_sve_vl(task)); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); memset(sst, 0, SVE_SIG_REGS_SIZE(vq)); __fpsimd_to_sve(sst, fst, vq); @@ -680,8 +837,7 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, /* * To ensure the FPSIMD bits of the SVE vector registers are preserved, * write any live register state back to task_struct, and convert to a - * regular FPSIMD thread. Since the vector length can only be changed - * with a syscall we can't be in streaming mode while reconfiguring. + * regular FPSIMD thread. */ if (task == current) { get_cpu_fpsimd_context(); @@ -690,17 +846,26 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, } fpsimd_flush_task_state(task); - if (test_and_clear_tsk_thread_flag(task, TIF_SVE)) + if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || + thread_sm_enabled(&task->thread)) sve_to_fpsimd(task); + if (system_supports_sme() && type == ARM64_VEC_SME) { + task->thread.svcr &= ~(SVCR_SM_MASK | + SVCR_ZA_MASK); + clear_thread_flag(TIF_SME); + } + if (task == current) put_cpu_fpsimd_context(); /* - * Force reallocation of task SVE state to the correct size - * on next use: + * Force reallocation of task SVE and SME state to the correct + * size on next use: */ sve_free(task); + if (system_supports_sme() && type == ARM64_VEC_SME) + sme_free(task); task_set_vl(task, type, vl); @@ -761,6 +926,36 @@ int sve_get_current_vl(void) return vec_prctl_status(ARM64_VEC_SVE, 0); } +#ifdef CONFIG_ARM64_SME +/* PR_SME_SET_VL */ +int sme_set_current_vl(unsigned long arg) +{ + unsigned long vl, flags; + int ret; + + vl = arg & PR_SME_VL_LEN_MASK; + flags = arg & ~vl; + + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + ret = vec_set_vector_length(current, ARM64_VEC_SME, vl, flags); + if (ret) + return ret; + + return vec_prctl_status(ARM64_VEC_SME, flags); +} + +/* PR_SME_GET_VL */ +int sme_get_current_vl(void) +{ + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + return vec_prctl_status(ARM64_VEC_SME, 0); +} +#endif /* CONFIG_ARM64_SME */ + static void vec_probe_vqs(struct vl_info *info, DECLARE_BITMAP(map, SVE_VQ_MAX)) { @@ -770,7 +965,23 @@ static void vec_probe_vqs(struct vl_info *info, for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) { write_vl(info->type, vq - 1); /* self-syncing */ - vl = sve_get_vl(); + + switch (info->type) { + case ARM64_VEC_SVE: + vl = sve_get_vl(); + break; + case ARM64_VEC_SME: + vl = sme_get_vl(); + break; + default: + vl = 0; + break; + } + + /* Minimum VL identified? */ + if (sve_vq_from_vl(vl) > vq) + break; + vq = sve_vq_from_vl(vl); /* skip intervening lengths */ set_bit(__vq_to_bit(vq), map); } @@ -856,21 +1067,25 @@ int vec_verify_vq_map(enum vec_type type) static void __init sve_efi_setup(void) { - struct vl_info *info = &vl_info[ARM64_VEC_SVE]; + int max_vl = 0; + int i; if (!IS_ENABLED(CONFIG_EFI)) return; + for (i = 0; i < ARRAY_SIZE(vl_info); i++) + max_vl = max(vl_info[i].max_vl, max_vl); + /* * alloc_percpu() warns and prints a backtrace if this goes wrong. * This is evidence of a crippled system and we are returning void, * so no attempt is made to handle this situation here. */ - if (!sve_vl_valid(info->max_vl)) + if (!sve_vl_valid(max_vl)) goto fail; efi_sve_state = __alloc_percpu( - SVE_SIG_REGS_SIZE(sve_vq_from_vl(info->max_vl)), SVE_VQ_BYTES); + SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), SVE_VQ_BYTES); if (!efi_sve_state) goto fail; @@ -989,10 +1204,172 @@ void __init sve_setup(void) void fpsimd_release_task(struct task_struct *dead_task) { __sve_free(dead_task); + sme_free(dead_task); } #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +/* + * Ensure that task->thread.za_state is allocated and sufficiently large. + * + * This function should be used only in preparation for replacing + * task->thread.za_state with new data. The memory is always zeroed + * here to prevent stale data from showing through: this is done in + * the interest of testability and predictability, the architecture + * guarantees that when ZA is enabled it will be zeroed. + */ +void sme_alloc(struct task_struct *task) +{ + if (task->thread.za_state) { + memset(task->thread.za_state, 0, za_state_size(task)); + return; + } + + /* This could potentially be up to 64K. */ + task->thread.za_state = + kzalloc(za_state_size(task), GFP_KERNEL); +} + +static void sme_free(struct task_struct *task) +{ + kfree(task->thread.za_state); + task->thread.za_state = NULL; +} + +void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* Set priority for all PEs to architecturally defined minimum */ + write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK, + SYS_SMPRI_EL1); + + /* Allow SME in kernel */ + write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1); + isb(); + + /* Allow EL0 to access TPIDR2 */ + write_sysreg(read_sysreg(SCTLR_EL1) | SCTLR_ELx_ENTP2, SCTLR_EL1); + isb(); +} + +/* + * This must be called after sme_kernel_enable(), we rely on the + * feature table being sorted to ensure this. + */ +void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* Allow use of FA64 */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK, + SYS_SMCR_EL1); +} + +/* + * Read the pseudo-SMCR used by cpufeatures to identify the supported + * vector length. + * + * Use only if SME is present. + * This function clobbers the SME vector length. + */ +u64 read_smcr_features(void) +{ + u64 smcr; + unsigned int vq_max; + + sme_kernel_enable(NULL); + sme_smstart_sm(); + + /* + * Set the maximum possible VL. + */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_LEN_MASK, + SYS_SMCR_EL1); + + smcr = read_sysreg_s(SYS_SMCR_EL1); + smcr &= ~(u64)SMCR_ELx_LEN_MASK; /* Only the LEN field */ + vq_max = sve_vq_from_vl(sve_get_vl()); + smcr |= vq_max - 1; /* set LEN field to maximum effective value */ + + sme_smstop_sm(); + + return smcr; +} + +void __init sme_setup(void) +{ + struct vl_info *info = &vl_info[ARM64_VEC_SME]; + u64 smcr; + int min_bit; + + if (!system_supports_sme()) + return; + + /* + * SME doesn't require any particular vector length be + * supported but it does require at least one. We should have + * disabled the feature entirely while bringing up CPUs but + * let's double check here. + */ + WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX)); + + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); + + smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1); + info->max_vl = sve_vl_from_vq((smcr & SMCR_ELx_LEN_MASK) + 1); + + /* + * Sanity-check that the max VL we determined through CPU features + * corresponds properly to sme_vq_map. If not, do our best: + */ + if (WARN_ON(info->max_vl != find_supported_vector_length(ARM64_VEC_SME, + info->max_vl))) + info->max_vl = find_supported_vector_length(ARM64_VEC_SME, + info->max_vl); + + WARN_ON(info->min_vl > info->max_vl); + + /* + * For the default VL, pick the maximum supported value <= 32 + * (256 bits) if there is one since this is guaranteed not to + * grow the signal frame when in streaming mode, otherwise the + * minimum available VL will be used. + */ + set_sme_default_vl(find_supported_vector_length(ARM64_VEC_SME, 32)); + + pr_info("SME: minimum available vector length %u bytes per vector\n", + info->min_vl); + pr_info("SME: maximum available vector length %u bytes per vector\n", + info->max_vl); + pr_info("SME: default vector length %u bytes per vector\n", + get_sme_default_vl()); +} + +#endif /* CONFIG_ARM64_SME */ + +static void sve_init_regs(void) +{ + /* + * Convert the FPSIMD state to SVE, zeroing all the state that + * is not shared with FPSIMD. If (as is likely) the current + * state is live in the registers then do this there and + * update our metadata for the current task including + * disabling the trap, otherwise update our in-memory copy. + * We are guaranteed to not be in streaming mode, we can only + * take a SVE trap when not in streaming mode and we can't be + * in streaming mode when taking a SME trap. + */ + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { + unsigned long vq_minus_one = + sve_vq_from_vl(task_get_sve_vl(current)) - 1; + sve_set_vq(vq_minus_one); + sve_flush_live(true, vq_minus_one); + fpsimd_bind_task_to_cpu(); + } else { + fpsimd_to_sve(current); + } +} + /* * Trapped SVE access * @@ -1004,7 +1381,7 @@ void fpsimd_release_task(struct task_struct *dead_task) * would have disabled the SVE access trap for userspace during * ret_to_user, making an SVE access trap impossible in that case. */ -void do_sve_acc(unsigned int esr, struct pt_regs *regs) +void do_sve_acc(unsigned long esr, struct pt_regs *regs) { /* Even if we chose not to use SVE, the hardware could still trap: */ if (unlikely(!system_supports_sve()) || WARN_ON(is_compat_task())) { @@ -1024,29 +1401,84 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs) WARN_ON(1); /* SVE access shouldn't have trapped */ /* - * Convert the FPSIMD state to SVE, zeroing all the state that - * is not shared with FPSIMD. If (as is likely) the current - * state is live in the registers then do this there and - * update our metadata for the current task including - * disabling the trap, otherwise update our in-memory copy. + * Even if the task can have used streaming mode we can only + * generate SVE access traps in normal SVE mode and + * transitioning out of streaming mode may discard any + * streaming mode state. Always clear the high bits to avoid + * any potential errors tracking what is properly initialised. + */ + sve_init_regs(); + + put_cpu_fpsimd_context(); +} + +/* + * Trapped SME access + * + * Storage is allocated for the full SVE and SME state, the current + * FPSIMD register contents are migrated to SVE if SVE is not already + * active, and the access trap is disabled. + * + * TIF_SME should be clear on entry: otherwise, fpsimd_restore_current_state() + * would have disabled the SME access trap for userspace during + * ret_to_user, making an SVE access trap impossible in that case. + */ +void do_sme_acc(unsigned long esr, struct pt_regs *regs) +{ + /* Even if we chose not to use SME, the hardware could still trap: */ + if (unlikely(!system_supports_sme()) || WARN_ON(is_compat_task())) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + /* + * If this not a trap due to SME being disabled then something + * is being used in the wrong mode, report as SIGILL. */ + if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + sve_alloc(current); + sme_alloc(current); + if (!current->thread.sve_state || !current->thread.za_state) { + force_sig(SIGKILL); + return; + } + + get_cpu_fpsimd_context(); + + /* With TIF_SME userspace shouldn't generate any traps */ + if (test_and_set_thread_flag(TIF_SME)) + WARN_ON(1); + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { unsigned long vq_minus_one = - sve_vq_from_vl(task_get_sve_vl(current)) - 1; - sve_set_vq(vq_minus_one); - sve_flush_live(true, vq_minus_one); + sve_vq_from_vl(task_get_sme_vl(current)) - 1; + sme_set_vq(vq_minus_one); + fpsimd_bind_task_to_cpu(); - } else { - fpsimd_to_sve(current); } + /* + * If SVE was not already active initialise the SVE registers, + * any non-shared state between the streaming and regular SVE + * registers is architecturally guaranteed to be zeroed when + * we enter streaming mode. We do not need to initialize ZA + * since ZA must be disabled at this point and enabling ZA is + * architecturally defined to zero ZA. + */ + if (system_supports_sve() && !test_thread_flag(TIF_SVE)) + sve_init_regs(); + put_cpu_fpsimd_context(); } /* * Trapped FP/ASIMD access. */ -void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) +void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs) { /* TODO: implement lazy context saving/restoring */ WARN_ON(1); @@ -1055,7 +1487,7 @@ void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) /* * Raise a SIGFPE for the current process. */ -void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) +void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs) { unsigned int si_code = FPE_FLTUNK; @@ -1141,6 +1573,9 @@ static void fpsimd_flush_thread_vl(enum vec_type type) void fpsimd_flush_thread(void) { + void *sve_state = NULL; + void *za_state = NULL; + if (!system_supports_fpsimd()) return; @@ -1152,11 +1587,28 @@ void fpsimd_flush_thread(void) if (system_supports_sve()) { clear_thread_flag(TIF_SVE); - sve_free(current); + + /* Defer kfree() while in atomic context */ + sve_state = current->thread.sve_state; + current->thread.sve_state = NULL; + fpsimd_flush_thread_vl(ARM64_VEC_SVE); } + if (system_supports_sme()) { + clear_thread_flag(TIF_SME); + + /* Defer kfree() while in atomic context */ + za_state = current->thread.za_state; + current->thread.za_state = NULL; + + fpsimd_flush_thread_vl(ARM64_VEC_SME); + current->thread.svcr = 0; + } + put_cpu_fpsimd_context(); + kfree(sve_state); + kfree(za_state); } /* @@ -1198,22 +1650,34 @@ static void fpsimd_bind_task_to_cpu(void) WARN_ON(!system_supports_fpsimd()); last->st = ¤t->thread.uw.fpsimd_state; last->sve_state = current->thread.sve_state; + last->za_state = current->thread.za_state; last->sve_vl = task_get_sve_vl(current); + last->sme_vl = task_get_sme_vl(current); + last->svcr = ¤t->thread.svcr; current->thread.fpsimd_cpu = smp_processor_id(); + /* + * Toggle SVE and SME trapping for userspace if needed, these + * are serialsied by ret_to_user(). + */ + if (system_supports_sme()) { + if (test_thread_flag(TIF_SME)) + sme_user_enable(); + else + sme_user_disable(); + } + if (system_supports_sve()) { - /* Toggle SVE trapping for userspace if needed */ if (test_thread_flag(TIF_SVE)) sve_user_enable(); else sve_user_disable(); - - /* Serialised by exception return to user */ } } void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, - unsigned int sve_vl) + unsigned int sve_vl, void *za_state, + unsigned int sme_vl, u64 *svcr) { struct fpsimd_last_state_struct *last = this_cpu_ptr(&fpsimd_last_state); @@ -1222,8 +1686,11 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, WARN_ON(!in_softirq() && !irqs_disabled()); last->st = st; + last->svcr = svcr; last->sve_state = sve_state; + last->za_state = za_state; last->sve_vl = sve_vl; + last->sme_vl = sme_vl; } /* @@ -1320,6 +1787,15 @@ static void fpsimd_flush_cpu_state(void) { WARN_ON(!system_supports_fpsimd()); __this_cpu_write(fpsimd_last_state.st, NULL); + + /* + * Leaving streaming mode enabled will cause issues for any kernel + * NEON and leaving streaming mode or ZA enabled may increase power + * consumption. + */ + if (system_supports_sme()) + sme_smstop(); + set_thread_flag(TIF_FOREIGN_FPSTATE); } @@ -1397,6 +1873,7 @@ EXPORT_SYMBOL(kernel_neon_end); static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); static DEFINE_PER_CPU(bool, efi_sve_state_used); +static DEFINE_PER_CPU(bool, efi_sm_state); /* * EFI runtime services support functions @@ -1431,12 +1908,28 @@ void __efi_fpsimd_begin(void) */ if (system_supports_sve() && likely(efi_sve_state)) { char *sve_state = this_cpu_ptr(efi_sve_state); + bool ffr = true; + u64 svcr; __this_cpu_write(efi_sve_state_used, true); + if (system_supports_sme()) { + svcr = read_sysreg_s(SYS_SVCR); + + if (!system_supports_fa64()) + ffr = svcr & SVCR_SM_MASK; + + __this_cpu_write(efi_sm_state, ffr); + } + sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()), &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - true); + ffr); + + if (system_supports_sme()) + sysreg_clear_set_s(SYS_SVCR, + SVCR_SM_MASK, 0); + } else { fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); } @@ -1459,11 +1952,26 @@ void __efi_fpsimd_end(void) if (system_supports_sve() && likely(__this_cpu_read(efi_sve_state_used))) { char const *sve_state = this_cpu_ptr(efi_sve_state); + bool ffr = true; + + /* + * Restore streaming mode; EFI calls are + * normal function calls so should not return in + * streaming mode. + */ + if (system_supports_sme()) { + if (__this_cpu_read(efi_sm_state)) { + sysreg_clear_set_s(SYS_SVCR, + 0, + SVCR_SM_MASK); + if (!system_supports_fa64()) + ffr = efi_sm_state; + } + } - sve_set_vq(sve_vq_from_vl(sve_get_vl()) - 1); sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()), &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - true); + ffr); __this_cpu_write(efi_sve_state_used, false); } else { @@ -1538,6 +2046,13 @@ static int __init fpsimd_init(void) if (!cpu_have_named_feature(ASIMD)) pr_notice("Advanced SIMD is not implemented\n"); - return sve_sysctl_init(); + + if (cpu_have_named_feature(SME) && !cpu_have_named_feature(SVE)) + pr_notice("SME is implemented but not SVE\n"); + + sve_sysctl_init(); + sme_sysctl_init(); + + return 0; } core_initcall(fpsimd_init); diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 4506c4a90ac1..f447c4a36f69 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -268,6 +268,22 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, } #ifdef CONFIG_DYNAMIC_FTRACE + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + /* + * When DYNAMIC_FTRACE_WITH_REGS is selected, `fregs` can never be NULL + * and arch_ftrace_get_regs(fregs) will always give a non-NULL pt_regs + * in which we can safely modify the LR. + */ + struct pt_regs *regs = arch_ftrace_get_regs(fregs); + unsigned long *parent = (unsigned long *)&procedure_link_pointer(regs); + + prepare_ftrace_return(ip, parent, frame_pointer(regs)); +} +#else /* * Turn on/off the call to ftrace_graph_caller() in ftrace_caller() * depending on @enable. @@ -297,5 +313,6 @@ int ftrace_disable_ftrace_graph_caller(void) { return ftrace_modify_graph_caller(false); } +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index cd868084e724..b29a311bb055 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -617,7 +617,7 @@ NOKPROBE_SYMBOL(toggle_bp_registers); /* * Debug exception handlers. */ -static int breakpoint_handler(unsigned long unused, unsigned int esr, +static int breakpoint_handler(unsigned long unused, unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step; @@ -751,7 +751,7 @@ static int watchpoint_report(struct perf_event *wp, unsigned long addr, return step; } -static int watchpoint_handler(unsigned long addr, unsigned int esr, +static int watchpoint_handler(unsigned long addr, unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step, access, closest_match = 0; diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 2aede780fb80..cda9c1e9864f 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -232,14 +232,14 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, return err; } -static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr) +static int kgdb_brk_fn(struct pt_regs *regs, unsigned long esr) { kgdb_handle_exception(1, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } NOKPROBE_SYMBOL(kgdb_brk_fn) -static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr) +static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned long esr) { compiled_break = 1; kgdb_handle_exception(1, SIGTRAP, 0, regs); @@ -248,7 +248,7 @@ static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr) } NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); -static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) +static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned long esr) { if (!kgdb_single_step) return DBG_HOOK_ERROR; diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index e16b248699d5..19c2d487cb08 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -329,8 +329,13 @@ bool crash_is_nosave(unsigned long pfn) /* in reserved memory? */ addr = __pfn_to_phys(pfn); - if ((addr < crashk_res.start) || (crashk_res.end < addr)) - return false; + if ((addr < crashk_res.start) || (crashk_res.end < addr)) { + if (!crashk_low_res.end) + return false; + + if ((addr < crashk_low_res.start) || (crashk_low_res.end < addr)) + return false; + } if (!kexec_crash_image) return true; diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 59c648d51848..889951291cc0 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -65,10 +65,18 @@ static int prepare_elf_headers(void **addr, unsigned long *sz) /* Exclude crashkernel region */ ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + goto out; + + if (crashk_low_res.end) { + ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + goto out; + } - if (!ret) - ret = crash_prepare_elf64_headers(cmem, true, addr, sz); + ret = crash_prepare_elf64_headers(cmem, true, addr, sz); +out: kfree(cmem); return ret; } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 78b3e0f8e997..57b30bcf9f21 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -15,6 +15,7 @@ #include <linux/swapops.h> #include <linux/thread_info.h> #include <linux/types.h> +#include <linux/uaccess.h> #include <linux/uio.h> #include <asm/barrier.h> @@ -76,6 +77,9 @@ void mte_sync_tags(pte_t old_pte, pte_t pte) mte_sync_page_tags(page, old_pte, check_swap, pte_is_tagged); } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); } int memcmp_pages(struct page *page1, struct page *page2) @@ -106,7 +110,8 @@ int memcmp_pages(struct page *page1, struct page *page2) static inline void __mte_enable_kernel(const char *mode, unsigned long tcf) { /* Enable MTE Sync Mode for EL1. */ - sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, tcf); + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK, + SYS_FIELD_PREP(SCTLR_EL1, TCF, tcf)); isb(); pr_info_once("MTE: enabled in %s mode at EL1\n", mode); @@ -122,12 +127,12 @@ void mte_enable_kernel_sync(void) WARN_ONCE(system_uses_mte_async_or_asymm_mode(), "MTE async mode enabled system wide!"); - __mte_enable_kernel("synchronous", SCTLR_ELx_TCF_SYNC); + __mte_enable_kernel("synchronous", SCTLR_EL1_TCF_SYNC); } void mte_enable_kernel_async(void) { - __mte_enable_kernel("asynchronous", SCTLR_ELx_TCF_ASYNC); + __mte_enable_kernel("asynchronous", SCTLR_EL1_TCF_ASYNC); /* * MTE async mode is set system wide by the first PE that @@ -144,7 +149,7 @@ void mte_enable_kernel_async(void) void mte_enable_kernel_asymm(void) { if (cpus_have_cap(ARM64_MTE_ASYMM)) { - __mte_enable_kernel("asymmetric", SCTLR_ELx_TCF_ASYMM); + __mte_enable_kernel("asymmetric", SCTLR_EL1_TCF_ASYMM); /* * MTE asymm mode behaves as async mode for store @@ -216,11 +221,11 @@ static void mte_update_sctlr_user(struct task_struct *task) * default order. */ if (resolved_mte_tcf & MTE_CTRL_TCF_ASYMM) - sctlr |= SCTLR_EL1_TCF0_ASYMM; + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYMM); else if (resolved_mte_tcf & MTE_CTRL_TCF_ASYNC) - sctlr |= SCTLR_EL1_TCF0_ASYNC; + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYNC); else if (resolved_mte_tcf & MTE_CTRL_TCF_SYNC) - sctlr |= SCTLR_EL1_TCF0_SYNC; + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, SYNC); task->thread.sctlr_user = sctlr; } @@ -543,3 +548,32 @@ static int register_mte_tcf_preferred_sysctl(void) return 0; } subsys_initcall(register_mte_tcf_preferred_sysctl); + +/* + * Return 0 on success, the number of bytes not probed otherwise. + */ +size_t mte_probe_user_range(const char __user *uaddr, size_t size) +{ + const char __user *end = uaddr + size; + int err = 0; + char val; + + __raw_get_user(val, uaddr, err); + if (err) + return size; + + uaddr = PTR_ALIGN(uaddr, MTE_GRANULE_SIZE); + while (uaddr < end) { + /* + * A read is sufficient for mte, the caller should have probed + * for the pte write permission if required. + */ + __raw_get_user(val, uaddr, err); + if (err) + return end - uaddr; + uaddr += MTE_GRANULE_SIZE; + } + (void)val; + + return 0; +} diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 75fed4460407..57c7c211f8c7 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -35,7 +35,7 @@ static u64 native_steal_clock(int cpu) DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); struct pv_time_stolen_time_region { - struct pvclock_vcpu_stolen_time *kaddr; + struct pvclock_vcpu_stolen_time __rcu *kaddr; }; static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region); @@ -52,7 +52,9 @@ early_param("no-steal-acc", parse_no_stealacc); /* return stolen time in ns by asking the hypervisor */ static u64 para_steal_clock(int cpu) { + struct pvclock_vcpu_stolen_time *kaddr = NULL; struct pv_time_stolen_time_region *reg; + u64 ret = 0; reg = per_cpu_ptr(&stolen_time_region, cpu); @@ -61,28 +63,37 @@ static u64 para_steal_clock(int cpu) * online notification callback runs. Until the callback * has run we just return zero. */ - if (!reg->kaddr) + rcu_read_lock(); + kaddr = rcu_dereference(reg->kaddr); + if (!kaddr) { + rcu_read_unlock(); return 0; + } - return le64_to_cpu(READ_ONCE(reg->kaddr->stolen_time)); + ret = le64_to_cpu(READ_ONCE(kaddr->stolen_time)); + rcu_read_unlock(); + return ret; } static int stolen_time_cpu_down_prepare(unsigned int cpu) { + struct pvclock_vcpu_stolen_time *kaddr = NULL; struct pv_time_stolen_time_region *reg; reg = this_cpu_ptr(&stolen_time_region); if (!reg->kaddr) return 0; - memunmap(reg->kaddr); - memset(reg, 0, sizeof(*reg)); + kaddr = rcu_replace_pointer(reg->kaddr, NULL, true); + synchronize_rcu(); + memunmap(kaddr); return 0; } static int stolen_time_cpu_online(unsigned int cpu) { + struct pvclock_vcpu_stolen_time *kaddr = NULL; struct pv_time_stolen_time_region *reg; struct arm_smccc_res res; @@ -93,17 +104,19 @@ static int stolen_time_cpu_online(unsigned int cpu) if (res.a0 == SMCCC_RET_NOT_SUPPORTED) return -EINVAL; - reg->kaddr = memremap(res.a0, + kaddr = memremap(res.a0, sizeof(struct pvclock_vcpu_stolen_time), MEMREMAP_WB); + rcu_assign_pointer(reg->kaddr, kaddr); + if (!reg->kaddr) { pr_warn("Failed to map stolen time data structure\n"); return -ENOMEM; } - if (le32_to_cpu(reg->kaddr->revision) != 0 || - le32_to_cpu(reg->kaddr->attributes) != 0) { + if (le32_to_cpu(kaddr->revision) != 0 || + le32_to_cpu(kaddr->attributes) != 0) { pr_warn_once("Unexpected revision or attributes in stolen time data\n"); return -ENXIO; } diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index d9dfa82c1f18..d1d182320245 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -335,7 +335,7 @@ static void __kprobes kprobe_handler(struct pt_regs *regs) } static int __kprobes -kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned int esr) +kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); unsigned long addr = instruction_pointer(regs); @@ -359,7 +359,7 @@ static struct break_hook kprobes_break_ss_hook = { }; static int __kprobes -kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) +kprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) { kprobe_handler(regs); return DBG_HOOK_HANDLED; diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 9be668f3f034..d49aef2657cd 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -166,7 +166,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, } static int uprobe_breakpoint_handler(struct pt_regs *regs, - unsigned int esr) + unsigned long esr) { if (uprobe_pre_sstep_notifier(regs)) return DBG_HOOK_HANDLED; @@ -175,7 +175,7 @@ static int uprobe_breakpoint_handler(struct pt_regs *regs, } static int uprobe_single_step_handler(struct pt_regs *regs, - unsigned int esr) + unsigned long esr) { struct uprobe_task *utask = current->utask; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 7fa97df55e3a..9734c9fb1a32 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -250,6 +250,8 @@ void show_regs(struct pt_regs *regs) static void tls_thread_flush(void) { write_sysreg(0, tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(0, SYS_TPIDR2_EL0); if (is_compat_task()) { current->thread.uw.tp_value = 0; @@ -298,16 +300,42 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's sve_state + * get erroneously used or freed prematurely. dst's copies * will be allocated on demand later on if dst uses SVE. * For consistency, also clear TIF_SVE here: this could be done * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF_SVE and sve_state in + * maintainers it is best not to leave TIF flags and buffers in * an inconsistent state, even temporarily. */ dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + /* + * In the unlikely event that we create a new thread with ZA + * enabled we should retain the ZA state so duplicate it here. + * This may be shortly freed if we exec() or if CLONE_SETTLS + * but it's simpler to do it here. To avoid confusing the rest + * of the code ensure that we have a sve_state allocated + * whenever za_state is allocated. + */ + if (thread_za_enabled(&src->thread)) { + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + dst->thread.za_state = kmemdup(src->thread.za_state, + za_state_size(src), + GFP_KERNEL); + if (!dst->thread.za_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + } else { + dst->thread.za_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + } + /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); @@ -343,6 +371,8 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2()) + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (stack_start) { if (is_compat_thread(task_thread_info(p))) @@ -353,10 +383,12 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, /* * If a TLS pointer was passed to clone, use it for the new - * thread. + * thread. We also reset TPIDR2 if it's in use. */ - if (clone_flags & CLONE_SETTLS) + if (clone_flags & CLONE_SETTLS) { p->thread.uw.tp_value = tls; + p->thread.tpidr2_el0 = 0; + } } else { /* * A kthread has no context to ERET to, so ensure any buggy @@ -387,6 +419,8 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, void tls_preserve_current_state(void) { *task_user_tls(current) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2() && !is_compat_task()) + current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); } static void tls_thread_switch(struct task_struct *next) @@ -399,6 +433,8 @@ static void tls_thread_switch(struct task_struct *next) write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(next->thread.tpidr2_el0, SYS_TPIDR2_EL0); } /* diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 230a47b9189e..21da83187a60 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -713,21 +713,51 @@ static int system_call_set(struct task_struct *target, #ifdef CONFIG_ARM64_SVE static void sve_init_header_from_task(struct user_sve_header *header, - struct task_struct *target) + struct task_struct *target, + enum vec_type type) { unsigned int vq; + bool active; + bool fpsimd_only; + enum vec_type task_type; memset(header, 0, sizeof(*header)); - header->flags = test_tsk_thread_flag(target, TIF_SVE) ? - SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD; - if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) - header->flags |= SVE_PT_VL_INHERIT; + /* Check if the requested registers are active for the task */ + if (thread_sm_enabled(&target->thread)) + task_type = ARM64_VEC_SME; + else + task_type = ARM64_VEC_SVE; + active = (task_type == type); + + switch (type) { + case ARM64_VEC_SVE: + if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = !test_tsk_thread_flag(target, TIF_SVE); + break; + case ARM64_VEC_SME: + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = false; + break; + default: + WARN_ON_ONCE(1); + return; + } - header->vl = task_get_sve_vl(target); + if (active) { + if (fpsimd_only) { + header->flags |= SVE_PT_REGS_FPSIMD; + } else { + header->flags |= SVE_PT_REGS_SVE; + } + } + + header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); - header->max_vl = sve_max_vl(); + header->max_vl = vec_max_vl(type); header->size = SVE_PT_SIZE(vq, header->flags); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); @@ -738,19 +768,17 @@ static unsigned int sve_size_from_header(struct user_sve_header const *header) return ALIGN(header->size, SVE_VQ_BYTES); } -static int sve_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) +static int sve_get_common(struct task_struct *target, + const struct user_regset *regset, + struct membuf to, + enum vec_type type) { struct user_sve_header header; unsigned int vq; unsigned long start, end; - if (!system_supports_sve()) - return -EINVAL; - /* Header */ - sve_init_header_from_task(&header, target); + sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); @@ -758,49 +786,61 @@ static int sve_get(struct task_struct *target, if (target == current) fpsimd_preserve_current_state(); - /* Registers: FPSIMD-only case */ - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) + BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + + switch ((header.flags & SVE_PT_REGS_MASK)) { + case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); - /* Otherwise: full SVE case */ + case SVE_PT_REGS_SVE: + start = SVE_PT_SVE_OFFSET; + end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); + membuf_write(&to, target->thread.sve_state, end - start); - BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); - start = SVE_PT_SVE_OFFSET; - end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); - membuf_write(&to, target->thread.sve_state, end - start); + start = end; + end = SVE_PT_SVE_FPSR_OFFSET(vq); + membuf_zero(&to, end - start); - start = end; - end = SVE_PT_SVE_FPSR_OFFSET(vq); - membuf_zero(&to, end - start); + /* + * Copy fpsr, and fpcr which must follow contiguously in + * struct fpsimd_state: + */ + start = end; + end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; + membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, + end - start); - /* - * Copy fpsr, and fpcr which must follow contiguously in - * struct fpsimd_state: - */ - start = end; - end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; - membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, end - start); + start = end; + end = sve_size_from_header(&header); + return membuf_zero(&to, end - start); - start = end; - end = sve_size_from_header(&header); - return membuf_zero(&to, end - start); + default: + return 0; + } } -static int sve_set(struct task_struct *target, +static int sve_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + struct membuf to) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SVE); +} + +static int sve_set_common(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf, + enum vec_type type) { int ret; struct user_sve_header header; unsigned int vq; unsigned long start, end; - if (!system_supports_sve()) - return -EINVAL; - /* Header */ if (count < sizeof(header)) return -EINVAL; @@ -813,13 +853,37 @@ static int sve_set(struct task_struct *target, * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by * vec_set_vector_length(), which will also validate them for us: */ - ret = vec_set_vector_length(target, ARM64_VEC_SVE, header.vl, + ret = vec_set_vector_length(target, type, header.vl, ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); if (ret) goto out; /* Actual VL set may be less than the user asked for: */ - vq = sve_vq_from_vl(task_get_sve_vl(target)); + vq = sve_vq_from_vl(task_get_vl(target, type)); + + /* Enter/exit streaming mode */ + if (system_supports_sme()) { + u64 old_svcr = target->thread.svcr; + + switch (type) { + case ARM64_VEC_SVE: + target->thread.svcr &= ~SVCR_SM_MASK; + break; + case ARM64_VEC_SME: + target->thread.svcr |= SVCR_SM_MASK; + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + /* + * If we switched then invalidate any existing SVE + * state and ensure there's storage. + */ + if (target->thread.svcr != old_svcr) + sve_alloc(target); + } /* Registers: FPSIMD-only case */ @@ -828,10 +892,15 @@ static int sve_set(struct task_struct *target, ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, SVE_PT_FPSIMD_OFFSET); clear_tsk_thread_flag(target, TIF_SVE); + if (type == ARM64_VEC_SME) + fpsimd_force_sync_to_sve(target); goto out; } - /* Otherwise: full SVE case */ + /* + * Otherwise: no registers or full SVE case. For backwards + * compatibility reasons we treat empty flags as SVE registers. + */ /* * If setting a different VL from the requested VL and there is @@ -852,8 +921,9 @@ static int sve_set(struct task_struct *target, /* * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing registers - * unmodified. + * FPSIMD regs, so that a short copyin leaves trailing + * registers unmodified. Always enable SVE even if going into + * streaming mode. */ fpsimd_sync_to_sve(target); set_tsk_thread_flag(target, TIF_SVE); @@ -889,8 +959,181 @@ out: return ret; } +static int sve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SVE); +} + #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int ssve_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SME); +} + +static int ssve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SME); +} + +static int za_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + memset(&header, 0, sizeof(header)); + + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header.flags |= ZA_PT_VL_INHERIT; + + header.vl = task_get_sme_vl(target); + vq = sve_vq_from_vl(header.vl); + header.max_vl = sme_max_vl(); + header.max_size = ZA_PT_SIZE(vq); + + /* If ZA is not active there is only the header */ + if (thread_za_enabled(&target->thread)) + header.size = ZA_PT_SIZE(vq); + else + header.size = ZA_PT_ZA_OFFSET; + + membuf_write(&to, &header, sizeof(header)); + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + end = ZA_PT_ZA_OFFSET; + + if (target == current) + fpsimd_preserve_current_state(); + + /* Any register data to include? */ + if (thread_za_enabled(&target->thread)) { + start = end; + end = ZA_PT_SIZE(vq); + membuf_write(&to, target->thread.za_state, end - start); + } + + /* Zero any trailing padding */ + start = end; + end = ALIGN(header.size, SVE_VQ_BYTES); + return membuf_zero(&to, end - start); +} + +static int za_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + if (count < sizeof(header)) + return -EINVAL; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, + 0, sizeof(header)); + if (ret) + goto out; + + /* + * All current ZA_PT_* flags are consumed by + * vec_set_vector_length(), which will also validate them for + * us: + */ + ret = vec_set_vector_length(target, ARM64_VEC_SME, header.vl, + ((unsigned long)header.flags) << 16); + if (ret) + goto out; + + /* Actual VL set may be less than the user asked for: */ + vq = sve_vq_from_vl(task_get_sme_vl(target)); + + /* Ensure there is some SVE storage for streaming mode */ + if (!target->thread.sve_state) { + sve_alloc(target); + if (!target->thread.sve_state) { + clear_thread_flag(TIF_SME); + ret = -ENOMEM; + goto out; + } + } + + /* Allocate/reinit ZA storage */ + sme_alloc(target); + if (!target->thread.za_state) { + ret = -ENOMEM; + clear_tsk_thread_flag(target, TIF_SME); + goto out; + } + + /* If there is no data then disable ZA */ + if (!count) { + target->thread.svcr &= ~SVCR_ZA_MASK; + goto out; + } + + /* + * If setting a different VL from the requested VL and there is + * register data, the data layout will be wrong: don't even + * try to set the registers in this case. + */ + if (vq != sve_vq_from_vl(header.vl)) { + ret = -EIO; + goto out; + } + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + start = ZA_PT_ZA_OFFSET; + end = ZA_PT_SIZE(vq); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.za_state, + start, end); + if (ret) + goto out; + + /* Mark ZA as active and let userspace use it */ + set_tsk_thread_flag(target, TIF_SME); + target->thread.svcr |= SVCR_ZA_MASK; + +out: + fpsimd_flush_task_state(target); + return ret; +} + +#endif /* CONFIG_ARM64_SME */ + #ifdef CONFIG_ARM64_PTR_AUTH static int pac_mask_get(struct task_struct *target, const struct user_regset *regset, @@ -1108,6 +1351,10 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_SVE REGSET_SVE, #endif +#ifdef CONFIG_ARM64_SVE + REGSET_SSVE, + REGSET_ZA, +#endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, REGSET_PAC_ENABLED_KEYS, @@ -1188,6 +1435,33 @@ static const struct user_regset aarch64_regsets[] = { .set = sve_set, }, #endif +#ifdef CONFIG_ARM64_SME + [REGSET_SSVE] = { /* Streaming mode SVE */ + .core_note_type = NT_ARM_SSVE, + .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE), + SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = ssve_get, + .set = ssve_set, + }, + [REGSET_ZA] = { /* SME ZA */ + .core_note_type = NT_ARM_ZA, + /* + * ZA is a single register but it's variably sized and + * the ptrace core requires that the size of any data + * be an exact multiple of the configured register + * size so report as though we had SVE_VQ_BYTES + * registers. These values aren't exposed to + * userspace. + */ + .n = DIV_ROUND_UP(ZA_PT_SIZE(SME_VQ_MAX), SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = za_get, + .set = za_set, + }, +#endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { .core_note_type = NT_ARM_PAC_MASK, diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index f0a3df9e18a3..413f899e4ac6 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -37,6 +37,15 @@ * safe memory that has been set up to be preserved during the copy operation. */ SYM_CODE_START(arm64_relocate_new_kernel) + /* + * The kimage structure isn't allocated specially and may be clobbered + * during relocation. We must load any values we need from it prior to + * any relocation occurring. + */ + ldr x28, [x0, #KIMAGE_START] + ldr x27, [x0, #KIMAGE_ARCH_EL2_VECTORS] + ldr x26, [x0, #KIMAGE_ARCH_DTB_MEM] + /* Setup the list loop variables. */ ldr x18, [x0, #KIMAGE_ARCH_ZERO_PAGE] /* x18 = zero page for BBM */ ldr x17, [x0, #KIMAGE_ARCH_TTBR1] /* x17 = linear map copy */ @@ -72,21 +81,20 @@ SYM_CODE_START(arm64_relocate_new_kernel) ic iallu dsb nsh isb - ldr x4, [x0, #KIMAGE_START] /* relocation start */ - ldr x1, [x0, #KIMAGE_ARCH_EL2_VECTORS] /* relocation start */ - ldr x0, [x0, #KIMAGE_ARCH_DTB_MEM] /* dtb address */ turn_off_mmu x12, x13 /* Start new image. */ - cbz x1, .Lel1 - mov x1, x4 /* relocation start */ - mov x2, x0 /* dtb address */ + cbz x27, .Lel1 + mov x1, x28 /* kernel entry point */ + mov x2, x26 /* dtb address */ mov x3, xzr mov x4, xzr mov x0, #HVC_SOFT_RESTART hvc #0 /* Jumps from el2 */ .Lel1: + mov x0, x26 /* dtb address */ + mov x1, xzr mov x2, xzr mov x3, xzr - br x4 /* Jumps from el1 */ + br x28 /* Jumps from el1 */ SYM_CODE_END(arm64_relocate_new_kernel) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 3505789cf4bd..fea3223704b6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -225,6 +225,8 @@ static void __init request_standard_resources(void) kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); + insert_resource(&iomem_resource, &kernel_code); + insert_resource(&iomem_resource, &kernel_data); num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); @@ -246,20 +248,7 @@ static void __init request_standard_resources(void) res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; } - request_resource(&iomem_resource, res); - - if (kernel_code.start >= res->start && - kernel_code.end <= res->end) - request_resource(res, &kernel_code); - if (kernel_data.start >= res->start && - kernel_data.end <= res->end) - request_resource(res, &kernel_data); -#ifdef CONFIG_KEXEC_CORE - /* Userspace will find "Crash kernel" region in /proc/iomem. */ - if (crashk_res.end && crashk_res.start >= res->start && - crashk_res.end <= res->end) - request_resource(res, &crashk_res); -#endif + insert_resource(&iomem_resource, res); } } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 4a4122ef6f39..edb2d9206a78 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -56,6 +56,7 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; unsigned long sve_offset; + unsigned long za_offset; unsigned long extra_offset; unsigned long end_offset; }; @@ -218,6 +219,7 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) struct user_ctxs { struct fpsimd_context __user *fpsimd; struct sve_context __user *sve; + struct za_context __user *za; }; #ifdef CONFIG_ARM64_SVE @@ -226,11 +228,17 @@ static int preserve_sve_context(struct sve_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; + u16 flags = 0; unsigned int vl = task_get_sve_vl(current); unsigned int vq = 0; - if (test_thread_flag(TIF_SVE)) + if (thread_sm_enabled(¤t->thread)) { + vl = task_get_sme_vl(current); vq = sve_vq_from_vl(vl); + flags |= SVE_SIG_FLAG_SM; + } else if (test_thread_flag(TIF_SVE)) { + vq = sve_vq_from_vl(vl); + } memset(reserved, 0, sizeof(reserved)); @@ -238,6 +246,7 @@ static int preserve_sve_context(struct sve_context __user *ctx) __put_user_error(round_up(SVE_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); + __put_user_error(flags, &ctx->flags, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); @@ -258,18 +267,28 @@ static int preserve_sve_context(struct sve_context __user *ctx) static int restore_sve_fpsimd_context(struct user_ctxs *user) { int err; - unsigned int vq; + unsigned int vl, vq; struct user_fpsimd_state fpsimd; struct sve_context sve; if (__copy_from_user(&sve, user->sve, sizeof(sve))) return -EFAULT; - if (sve.vl != task_get_sve_vl(current)) + if (sve.flags & SVE_SIG_FLAG_SM) { + if (!system_supports_sme()) + return -EINVAL; + + vl = task_get_sme_vl(current); + } else { + vl = task_get_sve_vl(current); + } + + if (sve.vl != vl) return -EINVAL; if (sve.head.size <= sizeof(*user->sve)) { clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; goto fpsimd_only; } @@ -301,7 +320,10 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return -EFAULT; - set_thread_flag(TIF_SVE); + if (sve.flags & SVE_SIG_FLAG_SM) + current->thread.svcr |= SVCR_SM_MASK; + else + set_thread_flag(TIF_SVE); fpsimd_only: /* copy the FP and status/control registers */ @@ -326,6 +348,101 @@ extern int restore_sve_fpsimd_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int preserve_za_context(struct za_context __user *ctx) +{ + int err = 0; + u16 reserved[ARRAY_SIZE(ctx->__reserved)]; + unsigned int vl = task_get_sme_vl(current); + unsigned int vq; + + if (thread_za_enabled(¤t->thread)) + vq = sve_vq_from_vl(vl); + else + vq = 0; + + memset(reserved, 0, sizeof(reserved)); + + __put_user_error(ZA_MAGIC, &ctx->head.magic, err); + __put_user_error(round_up(ZA_SIG_CONTEXT_SIZE(vq), 16), + &ctx->head.size, err); + __put_user_error(vl, &ctx->vl, err); + BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); + err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); + + if (vq) { + /* + * This assumes that the ZA state has already been saved to + * the task struct by calling the function + * fpsimd_signal_preserve_current_state(). + */ + err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, + current->thread.za_state, + ZA_SIG_REGS_SIZE(vq)); + } + + return err ? -EFAULT : 0; +} + +static int restore_za_context(struct user_ctxs __user *user) +{ + int err; + unsigned int vq; + struct za_context za; + + if (__copy_from_user(&za, user->za, sizeof(za))) + return -EFAULT; + + if (za.vl != task_get_sme_vl(current)) + return -EINVAL; + + if (za.head.size <= sizeof(*user->za)) { + current->thread.svcr &= ~SVCR_ZA_MASK; + return 0; + } + + vq = sve_vq_from_vl(za.vl); + + if (za.head.size < ZA_SIG_CONTEXT_SIZE(vq)) + return -EINVAL; + + /* + * Careful: we are about __copy_from_user() directly into + * thread.za_state with preemption enabled, so protection is + * needed to prevent a racing context switch from writing stale + * registers back over the new data. + */ + + fpsimd_flush_task_state(current); + /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ + + sme_alloc(current); + if (!current->thread.za_state) { + current->thread.svcr &= ~SVCR_ZA_MASK; + clear_thread_flag(TIF_SME); + return -ENOMEM; + } + + err = __copy_from_user(current->thread.za_state, + (char __user const *)user->za + + ZA_SIG_REGS_OFFSET, + ZA_SIG_REGS_SIZE(vq)); + if (err) + return -EFAULT; + + set_thread_flag(TIF_SME); + current->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} +#else /* ! CONFIG_ARM64_SME */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_za_context(void __user *ctx); +extern int restore_za_context(struct user_ctxs *user); + +#endif /* ! CONFIG_ARM64_SME */ static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) @@ -340,6 +457,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->fpsimd = NULL; user->sve = NULL; + user->za = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -393,7 +511,7 @@ static int parse_user_sigframe(struct user_ctxs *user, break; case SVE_MAGIC: - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) goto invalid; if (user->sve) @@ -405,6 +523,19 @@ static int parse_user_sigframe(struct user_ctxs *user, user->sve = (struct sve_context __user *)head; break; + case ZA_MAGIC: + if (!system_supports_sme()) + goto invalid; + + if (user->za) + goto invalid; + + if (size < sizeof(*user->za)) + goto invalid; + + user->za = (struct za_context __user *)head; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -528,6 +659,9 @@ static int restore_sigframe(struct pt_regs *regs, } } + if (err == 0 && system_supports_sme() && user.za) + err = restore_za_context(&user); + return err; } @@ -594,11 +728,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, if (system_supports_sve()) { unsigned int vq = 0; - if (add_all || test_thread_flag(TIF_SVE)) { - int vl = sve_max_vl(); + if (add_all || test_thread_flag(TIF_SVE) || + thread_sm_enabled(¤t->thread)) { + int vl = max(sve_max_vl(), sme_max_vl()); if (!add_all) - vl = task_get_sve_vl(current); + vl = thread_get_cur_vl(¤t->thread); vq = sve_vq_from_vl(vl); } @@ -609,6 +744,24 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } + if (system_supports_sme()) { + unsigned int vl; + unsigned int vq = 0; + + if (add_all) + vl = sme_max_vl(); + else + vl = task_get_sme_vl(current); + + if (thread_za_enabled(¤t->thread)) + vq = sve_vq_from_vl(vl); + + err = sigframe_alloc(user, &user->za_offset, + ZA_SIG_CONTEXT_SIZE(vq)); + if (err) + return err; + } + return sigframe_alloc_end(user); } @@ -649,13 +802,21 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } - /* Scalable Vector Extension state, if present */ - if (system_supports_sve() && err == 0 && user->sve_offset) { + /* Scalable Vector Extension state (including streaming), if present */ + if ((system_supports_sve() || system_supports_sme()) && + err == 0 && user->sve_offset) { struct sve_context __user *sve_ctx = apply_user_offset(user, user->sve_offset); err |= preserve_sve_context(sve_ctx); } + /* ZA state if present */ + if (system_supports_sme() && err == 0 && user->za_offset) { + struct za_context __user *za_ctx = + apply_user_offset(user, user->za_offset); + err |= preserve_za_context(za_ctx); + } + if (err == 0 && user->extra_offset) { char __user *sfp = (char __user *)user->sigframe; char __user *userp = @@ -759,6 +920,13 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, /* TCO (Tag Check Override) always cleared for signal handlers */ regs->pstate &= ~PSR_TCO_BIT; + /* Signal handlers are invoked with ZA and streaming mode disabled */ + if (system_supports_sme()) { + current->thread.svcr &= ~(SVCR_ZA_MASK | + SVCR_SM_MASK); + sme_smstop(); + } + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else @@ -1011,6 +1179,7 @@ static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x18); static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index d984282b979f..4700f8522d27 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -487,6 +487,7 @@ static_assert(offsetof(compat_siginfo_t, si_upper) == 0x18); static_assert(offsetof(compat_siginfo_t, si_pkey) == 0x14); static_assert(offsetof(compat_siginfo_t, si_perf_data) == 0x10); static_assert(offsetof(compat_siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_flags) == 0x18); static_assert(offsetof(compat_siginfo_t, si_band) == 0x0c); static_assert(offsetof(compat_siginfo_t, si_fd) == 0x10); static_assert(offsetof(compat_siginfo_t, si_call_addr) == 0x0c); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 3b46041f2b97..62ed361a4376 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -512,6 +512,7 @@ struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu) { return &cpu_madt_gicc[cpu]; } +EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc); /* * acpi_map_gic_cpu_interface - parse processor MADT entry diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index e4103e085681..0467cb79f080 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -19,43 +19,60 @@ #include <asm/stacktrace.h> /* - * AArch64 PCS assigns the frame pointer to x29. + * A snapshot of a frame record or fp/lr register values, along with some + * accounting information necessary for robust unwinding. * - * A simple function prologue looks like this: - * sub sp, sp, #0x10 - * stp x29, x30, [sp] - * mov x29, sp + * @fp: The fp value in the frame record (or the real fp) + * @pc: The lr value in the frame record (or the real lr) * - * A simple function epilogue looks like this: - * mov sp, x29 - * ldp x29, x30, [sp] - * add sp, sp, #0x10 + * @stacks_done: Stacks which have been entirely unwound, for which it is no + * longer valid to unwind to. + * + * @prev_fp: The fp that pointed to this frame record, or a synthetic value + * of 0. This is used to ensure that within a stack, each + * subsequent frame record is at an increasing address. + * @prev_type: The type of stack this frame record was on, or a synthetic + * value of STACK_TYPE_UNKNOWN. This is used to detect a + * transition from one stack to another. + * + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement lr + * value. */ +struct unwind_state { + unsigned long fp; + unsigned long pc; + DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); + unsigned long prev_fp; + enum stack_type prev_type; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif +}; - -static notrace void start_backtrace(struct stackframe *frame, unsigned long fp, - unsigned long pc) +static notrace void unwind_init(struct unwind_state *state, unsigned long fp, + unsigned long pc) { - frame->fp = fp; - frame->pc = pc; + state->fp = fp; + state->pc = pc; #ifdef CONFIG_KRETPROBES - frame->kr_cur = NULL; + state->kr_cur = NULL; #endif /* * Prime the first unwind. * - * In unwind_frame() we'll check that the FP points to a valid stack, + * In unwind_next() we'll check that the FP points to a valid stack, * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be * treated as a transition to whichever stack that happens to be. The * prev_fp value won't be used, but we set it to 0 such that it is * definitely not an accessible stack address. */ - bitmap_zero(frame->stacks_done, __NR_STACK_TYPES); - frame->prev_fp = 0; - frame->prev_type = STACK_TYPE_UNKNOWN; + bitmap_zero(state->stacks_done, __NR_STACK_TYPES); + state->prev_fp = 0; + state->prev_type = STACK_TYPE_UNKNOWN; } -NOKPROBE_SYMBOL(start_backtrace); +NOKPROBE_SYMBOL(unwind_init); /* * Unwind from one frame record (A) to the next frame record (B). @@ -64,15 +81,12 @@ NOKPROBE_SYMBOL(start_backtrace); * records (e.g. a cycle), determined based on the location and fp value of A * and the location (but not the fp value) of B. */ -static int notrace unwind_frame(struct task_struct *tsk, - struct stackframe *frame) +static int notrace unwind_next(struct task_struct *tsk, + struct unwind_state *state) { - unsigned long fp = frame->fp; + unsigned long fp = state->fp; struct stack_info info; - if (!tsk) - tsk = current; - /* Final frame; nothing to unwind */ if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) return -ENOENT; @@ -83,7 +97,7 @@ static int notrace unwind_frame(struct task_struct *tsk, if (!on_accessible_stack(tsk, fp, 16, &info)) return -EINVAL; - if (test_bit(info.type, frame->stacks_done)) + if (test_bit(info.type, state->stacks_done)) return -EINVAL; /* @@ -99,27 +113,27 @@ static int notrace unwind_frame(struct task_struct *tsk, * stack to another, it's never valid to unwind back to that first * stack. */ - if (info.type == frame->prev_type) { - if (fp <= frame->prev_fp) + if (info.type == state->prev_type) { + if (fp <= state->prev_fp) return -EINVAL; } else { - set_bit(frame->prev_type, frame->stacks_done); + set_bit(state->prev_type, state->stacks_done); } /* * Record this frame record's values and location. The prev_fp and - * prev_type are only meaningful to the next unwind_frame() invocation. + * prev_type are only meaningful to the next unwind_next() invocation. */ - frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); - frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); - frame->prev_fp = fp; - frame->prev_type = info.type; + state->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); + state->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); + state->prev_fp = fp; + state->prev_type = info.type; - frame->pc = ptrauth_strip_insn_pac(frame->pc); + state->pc = ptrauth_strip_insn_pac(state->pc); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (tsk->ret_stack && - (frame->pc == (unsigned long)return_to_handler)) { + (state->pc == (unsigned long)return_to_handler)) { unsigned long orig_pc; /* * This is a case where function graph tracer has @@ -127,37 +141,37 @@ static int notrace unwind_frame(struct task_struct *tsk, * to hook a function return. * So replace it to an original value. */ - orig_pc = ftrace_graph_ret_addr(tsk, NULL, frame->pc, - (void *)frame->fp); - if (WARN_ON_ONCE(frame->pc == orig_pc)) + orig_pc = ftrace_graph_ret_addr(tsk, NULL, state->pc, + (void *)state->fp); + if (WARN_ON_ONCE(state->pc == orig_pc)) return -EINVAL; - frame->pc = orig_pc; + state->pc = orig_pc; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #ifdef CONFIG_KRETPROBES - if (is_kretprobe_trampoline(frame->pc)) - frame->pc = kretprobe_find_ret_addr(tsk, (void *)frame->fp, &frame->kr_cur); + if (is_kretprobe_trampoline(state->pc)) + state->pc = kretprobe_find_ret_addr(tsk, (void *)state->fp, &state->kr_cur); #endif return 0; } -NOKPROBE_SYMBOL(unwind_frame); +NOKPROBE_SYMBOL(unwind_next); -static void notrace walk_stackframe(struct task_struct *tsk, - struct stackframe *frame, - bool (*fn)(void *, unsigned long), void *data) +static void notrace unwind(struct task_struct *tsk, + struct unwind_state *state, + stack_trace_consume_fn consume_entry, void *cookie) { while (1) { int ret; - if (!fn(data, frame->pc)) + if (!consume_entry(cookie, state->pc)) break; - ret = unwind_frame(tsk, frame); + ret = unwind_next(tsk, state); if (ret < 0) break; } } -NOKPROBE_SYMBOL(walk_stackframe); +NOKPROBE_SYMBOL(unwind); static bool dump_backtrace_entry(void *arg, unsigned long where) { @@ -196,17 +210,17 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) { - struct stackframe frame; + struct unwind_state state; if (regs) - start_backtrace(&frame, regs->regs[29], regs->pc); + unwind_init(&state, regs->regs[29], regs->pc); else if (task == current) - start_backtrace(&frame, + unwind_init(&state, (unsigned long)__builtin_frame_address(1), (unsigned long)__builtin_return_address(0)); else - start_backtrace(&frame, thread_saved_fp(task), + unwind_init(&state, thread_saved_fp(task), thread_saved_pc(task)); - walk_stackframe(task, &frame, consume_entry, cookie); + unwind(task, &state, consume_entry, cookie); } diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 12c6864e51e1..df14336c3a29 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -113,6 +113,6 @@ long compat_arm_syscall(struct pt_regs *regs, int scno) addr = instruction_pointer(regs) - (compat_thumb_mode(regs) ? 2 : 4); arm64_notify_die("Oops - bad compat syscall(2)", regs, - SIGILL, ILL_ILLTRP, addr, scno); + SIGILL, ILL_ILLTRP, addr, 0); return 0; } diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index c938603b3ba0..733451fe7e41 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -158,11 +158,36 @@ trace_exit: syscall_trace_exit(regs); } -static inline void sve_user_discard(void) +/* + * As per the ABI exit SME streaming mode and clear the SVE state not + * shared with FPSIMD on syscall entry. + */ +static inline void fp_user_discard(void) { + /* + * If SME is active then exit streaming mode. If ZA is active + * then flush the SVE registers but leave userspace access to + * both SVE and SME enabled, otherwise disable SME for the + * task and fall through to disabling SVE too. This means + * that after a syscall we never have any streaming mode + * register state to track, if this changes the KVM code will + * need updating. + */ + if (system_supports_sme() && test_thread_flag(TIF_SME)) { + u64 svcr = read_sysreg_s(SYS_SVCR); + + if (svcr & SVCR_SM_MASK) + sme_smstop_sm(); + } + if (!system_supports_sve()) return; + /* + * If SME is not active then disable SVE, the registers will + * be cleared when userspace next attempts to access them and + * we do not need to track the SVE register state until then. + */ clear_thread_flag(TIF_SVE); /* @@ -177,7 +202,7 @@ static inline void sve_user_discard(void) void do_el0_svc(struct pt_regs *regs) { - sve_user_discard(); + fp_user_discard(); el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table); } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 0529fd57567e..9ac7a81b79be 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -242,7 +242,7 @@ static void arm64_show_signal(int signo, const char *str) static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct task_struct *tsk = current; - unsigned int esr = tsk->thread.fault_code; + unsigned long esr = tsk->thread.fault_code; struct pt_regs *regs = task_pt_regs(tsk); /* Leave if the signal won't be shown */ @@ -253,7 +253,7 @@ static void arm64_show_signal(int signo, const char *str) pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk)); if (esr) - pr_cont("%s, ESR 0x%08x, ", esr_get_class_string(esr), esr); + pr_cont("%s, ESR 0x%016lx, ", esr_get_class_string(esr), esr); pr_cont("%s", str); print_vma_addr(KERN_CONT " in ", regs->pc); @@ -287,7 +287,7 @@ void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, void arm64_notify_die(const char *str, struct pt_regs *regs, int signo, int sicode, unsigned long far, - int err) + unsigned long err) { if (user_mode(regs)) { WARN_ON(regs != current_pt_regs()); @@ -439,7 +439,7 @@ exit: return fn ? fn(regs, instr) : 1; } -void force_signal_inject(int signal, int code, unsigned long address, unsigned int err) +void force_signal_inject(int signal, int code, unsigned long address, unsigned long err) { const char *desc; struct pt_regs *regs = current_pt_regs(); @@ -506,7 +506,7 @@ void do_bti(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_bti); -void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr) +void do_ptrauth_fault(struct pt_regs *regs, unsigned long esr) { /* * Unexpected FPAC exception or pointer authentication failure in @@ -532,7 +532,7 @@ NOKPROBE_SYMBOL(do_ptrauth_fault); uaccess_ttbr0_disable(); \ } -static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) +static void user_cache_maint_handler(unsigned long esr, struct pt_regs *regs) { unsigned long tagged_address, address; int rt = ESR_ELx_SYS64_ISS_RT(esr); @@ -572,7 +572,7 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void ctr_read_handler(unsigned int esr, struct pt_regs *regs) +static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0); @@ -591,7 +591,7 @@ static void ctr_read_handler(unsigned int esr, struct pt_regs *regs) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs) +static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); @@ -599,7 +599,7 @@ static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) +static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); @@ -607,7 +607,7 @@ static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void mrs_handler(unsigned int esr, struct pt_regs *regs) +static void mrs_handler(unsigned long esr, struct pt_regs *regs) { u32 sysreg, rt; @@ -618,15 +618,15 @@ static void mrs_handler(unsigned int esr, struct pt_regs *regs) force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } -static void wfi_handler(unsigned int esr, struct pt_regs *regs) +static void wfi_handler(unsigned long esr, struct pt_regs *regs) { arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } struct sys64_hook { - unsigned int esr_mask; - unsigned int esr_val; - void (*handler)(unsigned int esr, struct pt_regs *regs); + unsigned long esr_mask; + unsigned long esr_val; + void (*handler)(unsigned long esr, struct pt_regs *regs); }; static const struct sys64_hook sys64_hooks[] = { @@ -675,7 +675,7 @@ static const struct sys64_hook sys64_hooks[] = { }; #ifdef CONFIG_COMPAT -static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs) +static bool cp15_cond_valid(unsigned long esr, struct pt_regs *regs) { int cond; @@ -695,7 +695,7 @@ static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs) return aarch32_opcode_cond_checks[cond](regs->pstate); } -static void compat_cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) +static void compat_cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT; @@ -712,7 +712,7 @@ static const struct sys64_hook cp15_32_hooks[] = { {}, }; -static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs) +static void compat_cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT; int rt2 = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT; @@ -737,7 +737,7 @@ static const struct sys64_hook cp15_64_hooks[] = { {}, }; -void do_cp15instr(unsigned int esr, struct pt_regs *regs) +void do_cp15instr(unsigned long esr, struct pt_regs *regs) { const struct sys64_hook *hook, *hook_base; @@ -778,7 +778,7 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs) NOKPROBE_SYMBOL(do_cp15instr); #endif -void do_sysinstr(unsigned int esr, struct pt_regs *regs) +void do_sysinstr(unsigned long esr, struct pt_regs *regs) { const struct sys64_hook *hook; @@ -821,6 +821,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_SVE] = "SVE", [ESR_ELx_EC_ERET] = "ERET/ERETAA/ERETAB", [ESR_ELx_EC_FPAC] = "FPAC", + [ESR_ELx_EC_SME] = "SME", [ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF", [ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)", [ESR_ELx_EC_IABT_CUR] = "IABT (current EL)", @@ -842,7 +843,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_BRK64] = "BRK (AArch64)", }; -const char *esr_get_class_string(u32 esr) +const char *esr_get_class_string(unsigned long esr) { return esr_class_str[ESR_ELx_EC(esr)]; } @@ -851,7 +852,7 @@ const char *esr_get_class_string(u32 esr) * bad_el0_sync handles unexpected, but potentially recoverable synchronous * exceptions taken from EL0. */ -void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) +void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr) { unsigned long pc = instruction_pointer(regs); @@ -867,7 +868,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); -void panic_bad_stack(struct pt_regs *regs, unsigned int esr, unsigned long far) +void panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far) { unsigned long tsk_stk = (unsigned long)current->stack; unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); @@ -876,7 +877,7 @@ void panic_bad_stack(struct pt_regs *regs, unsigned int esr, unsigned long far) console_verbose(); pr_emerg("Insufficient stack space to handle exception!"); - pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr)); + pr_emerg("ESR: 0x%016lx -- %s\n", esr, esr_get_class_string(esr)); pr_emerg("FAR: 0x%016lx\n", far); pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", @@ -897,11 +898,11 @@ void panic_bad_stack(struct pt_regs *regs, unsigned int esr, unsigned long far) } #endif -void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr) +void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr) { console_verbose(); - pr_crit("SError Interrupt on CPU%d, code 0x%08x -- %s\n", + pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n", smp_processor_id(), esr, esr_get_class_string(esr)); if (regs) __show_regs(regs); @@ -912,9 +913,9 @@ void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr) unreachable(); } -bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) +bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) { - u32 aet = arm64_ras_serror_get_severity(esr); + unsigned long aet = arm64_ras_serror_get_severity(esr); switch (aet) { case ESR_ELx_AET_CE: /* corrected error */ @@ -944,7 +945,7 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) } } -void do_serror(struct pt_regs *regs, unsigned int esr) +void do_serror(struct pt_regs *regs, unsigned long esr) { /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) @@ -965,7 +966,7 @@ int is_valid_bugaddr(unsigned long addr) return 1; } -static int bug_handler(struct pt_regs *regs, unsigned int esr) +static int bug_handler(struct pt_regs *regs, unsigned long esr) { switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: @@ -990,7 +991,7 @@ static struct break_hook bug_break_hook = { .imm = BUG_BRK_IMM, }; -static int reserved_fault_handler(struct pt_regs *regs, unsigned int esr) +static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr) { pr_err("%s generated an invalid instruction at %pS!\n", "Kernel text patching", @@ -1012,7 +1013,7 @@ static struct break_hook fault_break_hook = { #define KASAN_ESR_SIZE_MASK 0x0f #define KASAN_ESR_SIZE(esr) (1 << ((esr) & KASAN_ESR_SIZE_MASK)) -static int kasan_handler(struct pt_regs *regs, unsigned int esr) +static int kasan_handler(struct pt_regs *regs, unsigned long esr) { bool recover = esr & KASAN_ESR_RECOVER; bool write = esr & KASAN_ESR_WRITE; @@ -1055,11 +1056,11 @@ static struct break_hook kasan_break_hook = { * Initial handler for AArch64 BRK exceptions * This handler only used until debug_traps_init(). */ -int __init early_brk64(unsigned long addr, unsigned int esr, +int __init early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs) { #ifdef CONFIG_KASAN_SW_TAGS - unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; + unsigned long comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; if ((comment & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 172452f79e46..f6e25d7c346a 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -32,7 +32,8 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO # -Wmissing-prototypes and -Wmissing-declarations are removed from # the CFLAGS of vgettimeofday.c to make possible to build the # kernel with CONFIG_WERROR enabled. -CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) \ +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ + $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ $(CC_FLAGS_LTO) -Wmissing-prototypes -Wmissing-declarations KASAN_SANITIZE := n KCSAN_SANITIZE := n @@ -52,9 +53,6 @@ GCOV_PROFILE := n targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) -# Force dependency (incbin is bad) -$(obj)/vdso.o : $(obj)/vdso.so - # Link rule for the .so file, .lds has to be first $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold_and_vdso_check) diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index ed181bedbffc..05ba1aae1b6f 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -131,9 +131,6 @@ obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) -# Force dependency (vdso.s includes vdso.so through incbin) -$(obj)/vdso.o: $(obj)/vdso.so - include/generated/vdso32-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index edaf0faf766f..2d4a8f995175 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -93,7 +93,6 @@ jiffies = jiffies_64; #ifdef CONFIG_HIBERNATION #define HIBERNATE_TEXT \ - . = ALIGN(SZ_4K); \ __hibernate_exit_text_start = .; \ *(.hibernate_exit.text) \ __hibernate_exit_text_end = .; @@ -103,7 +102,6 @@ jiffies = jiffies_64; #ifdef CONFIG_KEXEC_CORE #define KEXEC_TEXT \ - . = ALIGN(SZ_4K); \ __relocate_new_kernel_start = .; \ *(.kexec_relocate.text) \ __relocate_new_kernel_end = .; @@ -170,9 +168,6 @@ SECTIONS KPROBES_TEXT HYPERVISOR_TEXT IDMAP_TEXT - HIBERNATE_TEXT - KEXEC_TEXT - TRAMP_TEXT *(.gnu.warning) . = ALIGN(16); *(.got) /* Global offset table */ @@ -194,6 +189,14 @@ SECTIONS HYPERVISOR_DATA_SECTIONS + /* code sections that are never executed via the kernel mapping */ + .rodata.text : { + TRAMP_TEXT + HIBERNATE_TEXT + KEXEC_TEXT + . = ALIGN(PAGE_SIZE); + } + idmap_pg_dir = .; . += IDMAP_DIR_SIZE; idmap_pg_end = .; @@ -337,8 +340,8 @@ ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE, ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, "ID map text too big or misaligned") #ifdef CONFIG_HIBERNATION -ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) - <= SZ_4K, "Hibernate exit text too big or misaligned") +ASSERT(__hibernate_exit_text_end - __hibernate_exit_text_start <= SZ_4K, + "Hibernate exit text is bigger than 4 KiB") #endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) <= 3*PAGE_SIZE, @@ -362,7 +365,7 @@ ASSERT(swapper_pg_dir - tramp_pg_dir == TRAMP_SWAPPER_OFFSET, #ifdef CONFIG_KEXEC_CORE /* kexec relocation code should fit into one KEXEC_CONTROL_PAGE_SIZE */ -ASSERT(__relocate_new_kernel_end - (__relocate_new_kernel_start & ~(SZ_4K - 1)) - <= SZ_4K, "kexec relocation code is too big or misaligned") +ASSERT(__relocate_new_kernel_end - __relocate_new_kernel_start <= SZ_4K, + "kexec relocation code is bigger than 4 KiB") ASSERT(KEXEC_CONTROL_PAGE_SIZE >= SZ_4K, "KEXEC_CONTROL_PAGE_SIZE is broken") #endif diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 523bc934fe2f..cedc3ba2c098 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -783,6 +783,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) ret = 1; run->exit_reason = KVM_EXIT_UNKNOWN; + run->flags = 0; while (ret > 0) { /* * Check conditions before entering the guest @@ -1436,7 +1437,8 @@ static int kvm_init_vector_slots(void) base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT); - if (kvm_system_needs_idmapped_vectors() && !has_vhe()) { + if (kvm_system_needs_idmapped_vectors() && + !is_protected_kvm_enabled()) { err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), __BP_HARDEN_HYP_VECS_SZ, &base); if (err) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 397fdac75cb1..3d251a4d2cf7 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -82,6 +82,26 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED; + + /* + * We don't currently support SME guests but if we leave + * things in streaming mode then when the guest starts running + * FPSIMD or SVE code it may generate SME traps so as a + * special case if we are in streaming mode we force the host + * state to be saved now and exit streaming mode so that we + * don't have to handle any SME traps for valid guest + * operations. Do this for ZA as well for now for simplicity. + */ + if (system_supports_sme()) { + if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) + vcpu->arch.flags |= KVM_ARM64_HOST_SME_ENABLED; + + if (read_sysreg_s(SYS_SVCR) & + (SVCR_SM_MASK | SVCR_ZA_MASK)) { + vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; + fpsimd_save_and_flush_cpu_state(); + } + } } /* @@ -109,9 +129,14 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) WARN_ON_ONCE(!irqs_disabled()); if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { + /* + * Currently we do not support SME guests so SVCR is + * always 0 and we just need a variable to point to. + */ fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.fp_regs, vcpu->arch.sve_state, - vcpu->arch.sve_max_vl); + vcpu->arch.sve_max_vl, + NULL, 0, &vcpu->arch.svcr); clear_thread_flag(TIF_FOREIGN_FPSTATE); update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu)); @@ -130,6 +155,22 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); + /* + * If we have VHE then the Hyp code will reset CPACR_EL1 to + * CPACR_EL1_DEFAULT and we need to reenable SME. + */ + if (has_vhe() && system_supports_sme()) { + /* Also restore EL0 state seen on entry */ + if (vcpu->arch.flags & KVM_ARM64_HOST_SME_ENABLED) + sysreg_clear_set(CPACR_EL1, 0, + CPACR_EL1_SMEN_EL0EN | + CPACR_EL1_SMEN_EL1EN); + else + sysreg_clear_set(CPACR_EL1, + CPACR_EL1_SMEN_EL0EN, + CPACR_EL1_SMEN_EL1EN); + } + if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { if (vcpu_has_sve(vcpu)) { __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 97fe14aab1a3..0b829292dc54 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -26,7 +26,7 @@ typedef int (*exit_handle_fn)(struct kvm_vcpu *); -static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u32 esr) +static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr) { if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr)) kvm_inject_vabt(vcpu); @@ -117,10 +117,12 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - u32 esr = kvm_vcpu_get_esr(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); run->exit_reason = KVM_EXIT_DEBUG; - run->debug.arch.hsr = esr; + run->debug.arch.hsr = lower_32_bits(esr); + run->debug.arch.hsr_high = upper_32_bits(esr); + run->flags = KVM_DEBUG_ARCH_HSR_HIGH_VALID; if (ESR_ELx_EC(esr) == ESR_ELx_EC_WATCHPT_LOW) run->debug.arch.far = vcpu->arch.fault.far_el2; @@ -130,9 +132,9 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) { - u32 esr = kvm_vcpu_get_esr(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); - kvm_pr_unimpl("Unknown exception class: esr: %#08x -- %s\n", + kvm_pr_unimpl("Unknown exception class: esr: %#016llx -- %s\n", esr, esr_get_class_string(esr)); kvm_inject_undefined(vcpu); @@ -187,7 +189,7 @@ static exit_handle_fn arm_exit_handlers[] = { static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) { - u32 esr = kvm_vcpu_get_esr(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); u8 esr_ec = ESR_ELx_EC(esr); return arm_exit_handlers[esr_ec]; @@ -334,6 +336,6 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, */ kvm_err("Hyp Offset: 0x%llx\n", hyp_offset); - panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", + panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%016llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", spsr, elr_virt, esr, far, hpfar, par, vcpu); } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5d31f6c64c8c..37d9f211c200 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -266,7 +266,7 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } -static inline bool esr_is_ptrauth_trap(u32 esr) +static inline bool esr_is_ptrauth_trap(u64 esr) { switch (esr_sys64_to_sysreg(esr)) { case SYS_APIAKEYLO_EL1: diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index 5ad626527d41..fd55014b3497 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -159,20 +159,20 @@ * No restrictions on instructions implemented in AArch64. */ #define PVM_ID_AA64ISAR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR0_AES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA1) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA2) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_CRC32) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_RDM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_SM3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_SM4) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_DP) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_FHM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_TS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_TLB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_RNDR) \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ ) #define PVM_ID_AA64ISAR1_ALLOW (\ diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6410d21d8695..caace61ea459 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -47,10 +47,24 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val |= CPTR_EL2_TFP | CPTR_EL2_TZ; __activate_traps_fpsimd32(vcpu); } + if (cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; write_sysreg(val, cptr_el2); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); + if (cpus_have_final_cap(ARM64_SME)) { + val = read_sysreg_s(SYS_HFGRTR_EL2); + val &= ~(HFGxTR_EL2_nTPIDR2_EL0_MASK | + HFGxTR_EL2_nSMPRI_EL1_MASK); + write_sysreg_s(val, SYS_HFGRTR_EL2); + + val = read_sysreg_s(SYS_HFGWTR_EL2); + val &= ~(HFGxTR_EL2_nTPIDR2_EL0_MASK | + HFGxTR_EL2_nSMPRI_EL1_MASK); + write_sysreg_s(val, SYS_HFGWTR_EL2); + } + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt; @@ -94,9 +108,25 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); + if (cpus_have_final_cap(ARM64_SME)) { + u64 val; + + val = read_sysreg_s(SYS_HFGRTR_EL2); + val |= HFGxTR_EL2_nTPIDR2_EL0_MASK | + HFGxTR_EL2_nSMPRI_EL1_MASK; + write_sysreg_s(val, SYS_HFGRTR_EL2); + + val = read_sysreg_s(SYS_HFGWTR_EL2); + val |= HFGxTR_EL2_nTPIDR2_EL0_MASK | + HFGxTR_EL2_nSMPRI_EL1_MASK; + write_sysreg_s(val, SYS_HFGWTR_EL2); + } + cptr = CPTR_EL2_DEFAULT; if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED)) cptr |= CPTR_EL2_TZ; + if (cpus_have_final_cap(ARM64_SME)) + cptr &= ~CPTR_EL2_TSM; write_sysreg(cptr, cptr_el2); write_sysreg(__kvm_hyp_host_vector, vbar_el2); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 33f5181af330..619f94fc95fa 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -33,7 +33,7 @@ u64 id_aa64mmfr2_el1_sys_val; */ static void inject_undef64(struct kvm_vcpu *vcpu) { - u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 4fb419f7b8b6..6cb638b184b1 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -473,7 +473,7 @@ static int __vgic_v3_bpr_min(void) static int __vgic_v3_get_group(struct kvm_vcpu *vcpu) { - u32 esr = kvm_vcpu_get_esr(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; return crm != 8; @@ -1016,7 +1016,7 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) { int rt; - u32 esr; + u64 esr; u32 vmcr; void (*fn)(struct kvm_vcpu *, u32, int); bool is_read; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 262dfe03134d..969f20daf97a 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -41,7 +41,8 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val = read_sysreg(cpacr_el1); val |= CPACR_EL1_TTA; - val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN); + val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | + CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); /* * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to @@ -62,6 +63,10 @@ static void __activate_traps(struct kvm_vcpu *vcpu) __activate_traps_fpsimd32(vcpu); } + if (cpus_have_final_cap(ARM64_SME)) + write_sysreg(read_sysreg(sctlr_el2) & ~SCTLR_ELx_ENTP2, + sctlr_el2); + write_sysreg(val, cpacr_el1); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); @@ -83,6 +88,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + if (cpus_have_final_cap(ARM64_SME)) + write_sysreg(read_sysreg(sctlr_el2) | SCTLR_ELx_ENTP2, + sctlr_el2); + write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); if (!arm64_kernel_unmapped_at_el0()) diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index ba20405d2dc2..55a5dbe957e0 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -18,7 +18,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr { unsigned long cpsr = *vcpu_cpsr(vcpu); bool is_aarch32 = vcpu_mode_is_32bit(vcpu); - u32 esr = 0; + u64 esr = 0; vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | KVM_ARM64_EXCEPT_AA64_ELx_SYNC | @@ -50,7 +50,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr static void inject_undef64(struct kvm_vcpu *vcpu) { - u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | KVM_ARM64_EXCEPT_AA64_ELx_SYNC | diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 7b45c040cc27..18b403b58b53 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1123,8 +1123,7 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); - if (irqchip_in_kernel(vcpu->kvm) && - vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + if (kvm_vgic_global_state.type == VGIC_V3) { val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_GIC); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_GIC), 1); } @@ -1132,6 +1131,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); + + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_SME); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -1553,7 +1554,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), ID_SANITISED(ID_AA64ZFR0_EL1), - ID_UNALLOCATED(4,5), + ID_HIDDEN(ID_AA64SMFR0_EL1), ID_UNALLOCATED(4,6), ID_UNALLOCATED(4,7), @@ -1596,6 +1597,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility }, { SYS_DESC(SYS_TRFCR_EL1), undef_access }, + { SYS_DESC(SYS_SMPRI_EL1), undef_access }, + { SYS_DESC(SYS_SMCR_EL1), undef_access }, { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, @@ -1678,8 +1681,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, { SYS_DESC(SYS_CLIDR_EL1), access_clidr }, + { SYS_DESC(SYS_SMIDR_EL1), undef_access }, { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, { SYS_DESC(SYS_CTR_EL0), access_ctr }, + { SYS_DESC(SYS_SVCR), undef_access }, { PMU_SYS_REG(SYS_PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, .reg = PMCR_EL0 }, @@ -1719,6 +1724,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, + { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, { SYS_DESC(SYS_SCXTNUM_EL0), undef_access }, @@ -2304,7 +2310,7 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, size_t nr_global) { struct sys_reg_params params; - u32 esr = kvm_vcpu_get_esr(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); int Rt2 = (esr >> 10) & 0x1f; @@ -2354,7 +2360,7 @@ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, size_t nr_global) { struct sys_reg_params params; - u32 esr = kvm_vcpu_get_esr(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); params.CRm = (esr >> 1) & 0xf; diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index 5e90887deec4..695d7368fadc 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -299,29 +299,24 @@ static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, return insn; } +static const u32 aarch64_insn_ldst_size[] = { + [AARCH64_INSN_SIZE_8] = 0, + [AARCH64_INSN_SIZE_16] = 1, + [AARCH64_INSN_SIZE_32] = 2, + [AARCH64_INSN_SIZE_64] = 3, +}; + static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, u32 insn) { u32 size; - switch (type) { - case AARCH64_INSN_SIZE_8: - size = 0; - break; - case AARCH64_INSN_SIZE_16: - size = 1; - break; - case AARCH64_INSN_SIZE_32: - size = 2; - break; - case AARCH64_INSN_SIZE_64: - size = 3; - break; - default: + if (type < AARCH64_INSN_SIZE_8 || type > AARCH64_INSN_SIZE_64) { pr_err("%s: unknown size encoding %d\n", __func__, type); return AARCH64_BREAK_FAULT; } + size = aarch64_insn_ldst_size[type]; insn &= ~GENMASK(31, 30); insn |= size << 30; @@ -504,6 +499,50 @@ u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, offset); } +u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + unsigned int imm, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + u32 shift; + + if (size < AARCH64_INSN_SIZE_8 || size > AARCH64_INSN_SIZE_64) { + pr_err("%s: unknown size encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + shift = aarch64_insn_ldst_size[size]; + if (imm & ~(BIT(12 + shift) - BIT(shift))) { + pr_err("%s: invalid imm: %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; + } + + imm >>= shift; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_IMM_OFFSET: + insn = aarch64_insn_get_ldr_imm_value(); + break; + case AARCH64_INSN_LDST_STORE_IMM_OFFSET: + insn = aarch64_insn_get_str_imm_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); +} + u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, enum aarch64_insn_register reg2, enum aarch64_insn_register base, diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 8590af3c98c0..eeb9e45bcce8 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -93,7 +93,7 @@ SYM_FUNC_START(mte_copy_tags_from_user) mov x3, x1 cbz x2, 2f 1: - user_ldst 2f, ldtrb, w4, x1, 0 +USER(2f, ldtrb w4, [x1]) lsl x4, x4, #MTE_TAG_SHIFT stg x4, [x0], #MTE_GRANULE_SIZE add x1, x1, #1 @@ -120,7 +120,7 @@ SYM_FUNC_START(mte_copy_tags_to_user) 1: ldg x4, [x1] ubfx x4, x4, #MTE_TAG_SHIFT, #MTE_TAG_SIZE - user_ldst 2f, sttrb, w4, x0, 0 +USER(2f, sttrb w4, [x0]) add x0, x0, #1 add x1, x1, #MTE_GRANULE_SIZE subs x2, x2, #1 diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index b5447e53cd73..0dea80bf6de4 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -16,8 +16,8 @@ void copy_highpage(struct page *to, struct page *from) { - struct page *kto = page_address(to); - struct page *kfrom = page_address(from); + void *kto = page_address(to); + void *kfrom = page_address(from); copy_page(kto, kfrom); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 77341b160aca..c5e11768e5c1 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -43,7 +43,7 @@ #include <asm/traps.h> struct fault_info { - int (*fn)(unsigned long far, unsigned int esr, + int (*fn)(unsigned long far, unsigned long esr, struct pt_regs *regs); int sig; int code; @@ -53,17 +53,17 @@ struct fault_info { static const struct fault_info fault_info[]; static struct fault_info debug_fault_info[]; -static inline const struct fault_info *esr_to_fault_info(unsigned int esr) +static inline const struct fault_info *esr_to_fault_info(unsigned long esr) { return fault_info + (esr & ESR_ELx_FSC); } -static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr) +static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr) { return debug_fault_info + DBG_ESR_EVT(esr); } -static void data_abort_decode(unsigned int esr) +static void data_abort_decode(unsigned long esr) { pr_alert("Data abort info:\n"); @@ -85,11 +85,11 @@ static void data_abort_decode(unsigned int esr) (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT); } -static void mem_abort_decode(unsigned int esr) +static void mem_abort_decode(unsigned long esr) { pr_alert("Mem abort info:\n"); - pr_alert(" ESR = 0x%08x\n", esr); + pr_alert(" ESR = 0x%016lx\n", esr); pr_alert(" EC = 0x%02lx: %s, IL = %u bits\n", ESR_ELx_EC(esr), esr_get_class_string(esr), (esr & ESR_ELx_IL) ? 32 : 16); @@ -99,7 +99,7 @@ static void mem_abort_decode(unsigned int esr) pr_alert(" EA = %lu, S1PTW = %lu\n", (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT, (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT); - pr_alert(" FSC = 0x%02x: %s\n", (esr & ESR_ELx_FSC), + pr_alert(" FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC), esr_to_fault_info(esr)->name); if (esr_is_data_abort(esr)) @@ -229,20 +229,20 @@ int ptep_set_access_flags(struct vm_area_struct *vma, return 1; } -static bool is_el1_instruction_abort(unsigned int esr) +static bool is_el1_instruction_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; } -static bool is_el1_data_abort(unsigned int esr) +static bool is_el1_data_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR; } -static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr, +static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { - unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; + unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE; if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr)) return false; @@ -258,7 +258,7 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr, } static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, - unsigned int esr, + unsigned long esr, struct pt_regs *regs) { unsigned long flags; @@ -290,7 +290,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, } static void die_kernel_fault(const char *msg, unsigned long addr, - unsigned int esr, struct pt_regs *regs) + unsigned long esr, struct pt_regs *regs) { bust_spinlocks(1); @@ -308,7 +308,7 @@ static void die_kernel_fault(const char *msg, unsigned long addr, } #ifdef CONFIG_KASAN_HW_TAGS -static void report_tag_fault(unsigned long addr, unsigned int esr, +static void report_tag_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { /* @@ -320,11 +320,11 @@ static void report_tag_fault(unsigned long addr, unsigned int esr, } #else /* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */ -static inline void report_tag_fault(unsigned long addr, unsigned int esr, +static inline void report_tag_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { } #endif -static void do_tag_recovery(unsigned long addr, unsigned int esr, +static void do_tag_recovery(unsigned long addr, unsigned long esr, struct pt_regs *regs) { @@ -335,13 +335,14 @@ static void do_tag_recovery(unsigned long addr, unsigned int esr, * It will be done lazily on the other CPUs when they will hit a * tag fault. */ - sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_NONE); + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK, + SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE)); isb(); } -static bool is_el1_mte_sync_tag_check_fault(unsigned int esr) +static bool is_el1_mte_sync_tag_check_fault(unsigned long esr) { - unsigned int fsc = esr & ESR_ELx_FSC; + unsigned long fsc = esr & ESR_ELx_FSC; if (!is_el1_data_abort(esr)) return false; @@ -352,7 +353,7 @@ static bool is_el1_mte_sync_tag_check_fault(unsigned int esr) return false; } -static void __do_kernel_fault(unsigned long addr, unsigned int esr, +static void __do_kernel_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { const char *msg; @@ -393,7 +394,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, die_kernel_fault(msg, addr, esr, regs); } -static void set_thread_esr(unsigned long address, unsigned int esr) +static void set_thread_esr(unsigned long address, unsigned long esr) { current->thread.fault_address = address; @@ -441,7 +442,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr) * exception level). Fail safe by not providing an ESR * context record at all. */ - WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr); + WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr); esr = 0; break; } @@ -450,7 +451,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr) current->thread.fault_code = esr; } -static void do_bad_area(unsigned long far, unsigned int esr, +static void do_bad_area(unsigned long far, unsigned long esr, struct pt_regs *regs) { unsigned long addr = untagged_addr(far); @@ -501,7 +502,7 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, return handle_mm_fault(vma, addr, mm_flags, regs); } -static bool is_el0_instruction_abort(unsigned int esr) +static bool is_el0_instruction_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; } @@ -510,12 +511,12 @@ static bool is_el0_instruction_abort(unsigned int esr) * Note: not valid for EL1 DC IVAC, but we never use that such that it * should fault. EL0 cannot issue DC IVAC (undef). */ -static bool is_write_abort(unsigned int esr) +static bool is_write_abort(unsigned long esr) { return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); } -static int __kprobes do_page_fault(unsigned long far, unsigned int esr, +static int __kprobes do_page_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { const struct fault_info *inf; @@ -671,7 +672,7 @@ no_context: } static int __kprobes do_translation_fault(unsigned long far, - unsigned int esr, + unsigned long esr, struct pt_regs *regs) { unsigned long addr = untagged_addr(far); @@ -683,19 +684,19 @@ static int __kprobes do_translation_fault(unsigned long far, return 0; } -static int do_alignment_fault(unsigned long far, unsigned int esr, +static int do_alignment_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { do_bad_area(far, esr, regs); return 0; } -static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs) +static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs) { return 1; /* "fault" */ } -static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs) +static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs) { const struct fault_info *inf; unsigned long siaddr; @@ -725,7 +726,7 @@ static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs) return 0; } -static int do_tag_check_fault(unsigned long far, unsigned int esr, +static int do_tag_check_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { /* @@ -805,7 +806,7 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 63" }, }; -void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs) +void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs) { const struct fault_info *inf = esr_to_fault_info(esr); unsigned long addr = untagged_addr(far); @@ -825,14 +826,14 @@ void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs) } NOKPROBE_SYMBOL(do_mem_abort); -void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) +void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs) { arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN, addr, esr); } NOKPROBE_SYMBOL(do_sp_pc_abort); -int __init early_brk64(unsigned long addr, unsigned int esr, +int __init early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs); /* @@ -852,7 +853,7 @@ static struct fault_info __refdata debug_fault_info[] = { }; void __init hook_debug_fault_code(int nr, - int (*fn)(unsigned long, unsigned int, struct pt_regs *), + int (*fn)(unsigned long, unsigned long, struct pt_regs *), int sig, int code, const char *name) { BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info)); @@ -885,7 +886,7 @@ static void debug_exception_exit(struct pt_regs *regs) } NOKPROBE_SYMBOL(debug_exception_exit); -void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, +void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr, struct pt_regs *regs) { const struct fault_info *inf = esr_to_debug_fault_info(esr); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index cbace1c9e137..64bb078e2e7b 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -158,6 +158,28 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize) return contig_ptes; } +pte_t huge_ptep_get(pte_t *ptep) +{ + int ncontig, i; + size_t pgsize; + pte_t orig_pte = ptep_get(ptep); + + if (!pte_present(orig_pte) || !pte_cont(orig_pte)) + return orig_pte; + + ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); + for (i = 0; i < ncontig; i++, ptep++) { + pte_t pte = ptep_get(ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + return orig_pte; +} + /* * Changing some bits of contiguous entries requires us to follow a * Break-Before-Make approach, breaking the whole contiguous set @@ -166,15 +188,14 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize) * * This helper performs the break step. */ -static pte_t get_clear_flush(struct mm_struct *mm, +static pte_t get_clear_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = huge_ptep_get(ptep); - bool valid = pte_valid(orig_pte); - unsigned long i, saddr = addr; + pte_t orig_pte = ptep_get(ptep); + unsigned long i; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { pte_t pte = ptep_get_and_clear(mm, addr, ptep); @@ -190,11 +211,6 @@ static pte_t get_clear_flush(struct mm_struct *mm, if (pte_young(pte)) orig_pte = pte_mkyoung(orig_pte); } - - if (valid) { - struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); - flush_tlb_range(&vma, saddr, addr); - } return orig_pte; } @@ -385,14 +401,14 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, { int ncontig; size_t pgsize; - pte_t orig_pte = huge_ptep_get(ptep); + pte_t orig_pte = ptep_get(ptep); if (!pte_cont(orig_pte)) return ptep_get_and_clear(mm, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); - return get_clear_flush(mm, addr, ptep, pgsize, ncontig); + return get_clear_contig(mm, addr, ptep, pgsize, ncontig); } /* @@ -408,11 +424,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) { int i; - if (pte_write(pte) != pte_write(huge_ptep_get(ptep))) + if (pte_write(pte) != pte_write(ptep_get(ptep))) return 1; for (i = 0; i < ncontig; i++) { - pte_t orig_pte = huge_ptep_get(ptep + i); + pte_t orig_pte = ptep_get(ptep + i); if (pte_dirty(pte) != pte_dirty(orig_pte)) return 1; @@ -443,7 +459,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, if (!__cont_access_flags_changed(ptep, pte, ncontig)) return 0; - orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); + orig_pte = get_clear_contig(vma->vm_mm, addr, ptep, pgsize, ncontig); /* Make sure we don't lose the dirty or young state */ if (pte_dirty(orig_pte)) @@ -476,7 +492,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, ncontig = find_num_contig(mm, addr, ptep, &pgsize); dpfn = pgsize >> PAGE_SHIFT; - pte = get_clear_flush(mm, addr, ptep, pgsize, ncontig); + pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig); pte = pte_wrprotect(pte); hugeprot = pte_pgprot(pte); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1e7b1550e2fc..a1410143ea62 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -90,6 +90,32 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit; phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1; #endif +/* Current arm64 boot protocol requires 2MB alignment */ +#define CRASH_ALIGN SZ_2M + +#define CRASH_ADDR_LOW_MAX arm64_dma_phys_limit +#define CRASH_ADDR_HIGH_MAX (PHYS_MASK + 1) + +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); + + return 0; +} + /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -100,17 +126,35 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1; static void __init reserve_crashkernel(void) { unsigned long long crash_base, crash_size; - unsigned long long crash_max = arm64_dma_phys_limit; + unsigned long long crash_low_size = 0; + unsigned long long crash_max = CRASH_ADDR_LOW_MAX; + char *cmdline = boot_command_line; int ret; if (!IS_ENABLED(CONFIG_KEXEC_CORE)) return; - ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + /* crashkernel=X[@offset] */ + ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), &crash_size, &crash_base); - /* no crashkernel= or invalid value specified */ - if (ret || !crash_size) + if (ret == -ENOENT) { + ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base); + if (ret || !crash_size) + return; + + /* + * crashkernel=Y,low can be specified or not, but invalid value + * is not allowed. + */ + ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base); + if (ret && (ret != -ENOENT)) + return; + + crash_max = CRASH_ADDR_HIGH_MAX; + } else if (ret || !crash_size) { + /* The specified value is invalid */ return; + } crash_size = PAGE_ALIGN(crash_size); @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void) if (crash_base) crash_max = crash_base + crash_size; - /* Current arm64 boot protocol requires 2MB alignment */ - crash_base = memblock_phys_alloc_range(crash_size, SZ_2M, + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, crash_base, crash_max); if (!crash_base) { pr_warn("cannot allocate crashkernel (size:0x%llx)\n", @@ -127,6 +170,12 @@ static void __init reserve_crashkernel(void) return; } + if ((crash_base >= CRASH_ADDR_LOW_MAX) && + crash_low_size && reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); @@ -135,8 +184,12 @@ static void __init reserve_crashkernel(void) * map. Inform kmemleak so that it won't try to access it. */ kmemleak_ignore_phys(crash_base); + if (crashk_low_res.end) + kmemleak_ignore_phys(crashk_low_res.start); + crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); } /* @@ -157,7 +210,7 @@ static phys_addr_t __init max_zone_phys(unsigned int zone_bits) return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; } -static void __init zone_sizes_init(unsigned long min, unsigned long max) +static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; unsigned int __maybe_unused acpi_zone_dma_bits; @@ -176,7 +229,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) if (!arm64_dma_phys_limit) arm64_dma_phys_limit = dma32_phys_limit; #endif - max_zone_pfns[ZONE_NORMAL] = max; + max_zone_pfns[ZONE_NORMAL] = max_pfn; free_area_init(max_zone_pfns); } @@ -374,7 +427,7 @@ void __init bootmem_init(void) * done after the fixed reservations */ sparse_init(); - zone_sizes_init(min, max); + zone_sizes_init(); /* * Reserve the CMA area after arm64_dma_phys_limit was initialised. diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index b7c81dacabf0..b21f91cd830d 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -99,3 +99,11 @@ void __init early_ioremap_init(void) { early_ioremap_setup(); } + +bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + unsigned long pfn = PHYS_PFN(offset); + + return pfn_is_map_memory(pfn); +} diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index d7da8ca40d2e..4ea2eefbc053 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -238,7 +238,7 @@ int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0, int this_level, index, level_lsb, level_msb; dst_addr &= PAGE_MASK; - prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_EXEC)); + prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_ROX)); for (this_level = 3; this_level >= 0; this_level--) { levels[this_level] = trans_alloc(info); diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index dd59b5ad8fe4..194c95ccc1cf 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -66,6 +66,20 @@ #define A64_STR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, STORE) #define A64_LDR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, LOAD) +/* Load/store register (immediate offset) */ +#define A64_LS_IMM(Rt, Rn, imm, size, type) \ + aarch64_insn_gen_load_store_imm(Rt, Rn, imm, \ + AARCH64_INSN_SIZE_##size, \ + AARCH64_INSN_LDST_##type##_IMM_OFFSET) +#define A64_STRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, STORE) +#define A64_LDRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, LOAD) +#define A64_STRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, STORE) +#define A64_LDRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, LOAD) +#define A64_STR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, STORE) +#define A64_LDR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, LOAD) +#define A64_STR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, STORE) +#define A64_LDR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, LOAD) + /* Load/store register pair */ #define A64_LS_PAIR(Rt, Rt2, Rn, offset, ls, type) \ aarch64_insn_gen_load_store_pair(Rt, Rt2, Rn, offset, \ @@ -249,6 +263,9 @@ /* HINTs */ #define A64_HINT(x) aarch64_insn_gen_hint(x) +#define A64_PACIASP A64_HINT(AARCH64_INSN_HINT_PACIASP) +#define A64_AUTIASP A64_HINT(AARCH64_INSN_HINT_AUTIASP) + /* BTI */ #define A64_BTI_C A64_HINT(AARCH64_INSN_HINT_BTIC) #define A64_BTI_J A64_HINT(AARCH64_INSN_HINT_BTIJ) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index fcc675aa1670..8ab4035dea27 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -26,6 +26,7 @@ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) #define TCALL_CNT (MAX_BPF_JIT_REG + 2) #define TMP_REG_3 (MAX_BPF_JIT_REG + 3) +#define FP_BOTTOM (MAX_BPF_JIT_REG + 4) #define check_imm(bits, imm) do { \ if ((((imm) > 0) && ((imm) >> (bits))) || \ @@ -63,6 +64,7 @@ static const int bpf2a64[] = { [TCALL_CNT] = A64_R(26), /* temporary register for blinding constants */ [BPF_REG_AX] = A64_R(9), + [FP_BOTTOM] = A64_R(27), }; struct jit_ctx { @@ -73,6 +75,7 @@ struct jit_ctx { int exentry_idx; __le32 *image; u32 stack_size; + int fpb_offset; }; static inline void emit(const u32 insn, struct jit_ctx *ctx) @@ -191,11 +194,53 @@ static bool is_addsub_imm(u32 imm) return !(imm & ~0xfff) || !(imm & ~0xfff000); } +/* + * There are 3 types of AArch64 LDR/STR (immediate) instruction: + * Post-index, Pre-index, Unsigned offset. + * + * For BPF ldr/str, the "unsigned offset" type is sufficient. + * + * "Unsigned offset" type LDR(immediate) format: + * + * 3 2 1 0 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |x x|1 1 1 0 0 1 0 1| imm12 | Rn | Rt | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * scale + * + * "Unsigned offset" type STR(immediate) format: + * 3 2 1 0 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |x x|1 1 1 0 0 1 0 0| imm12 | Rn | Rt | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * scale + * + * The offset is calculated from imm12 and scale in the following way: + * + * offset = (u64)imm12 << scale + */ +static bool is_lsi_offset(int offset, int scale) +{ + if (offset < 0) + return false; + + if (offset > (0xFFF << scale)) + return false; + + if (offset & ((1 << scale) - 1)) + return false; + + return true; +} + /* Tail call offset to jump into */ -#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) -#define PROLOGUE_OFFSET 8 +#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) || \ + IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) +#define PROLOGUE_OFFSET 9 #else -#define PROLOGUE_OFFSET 7 +#define PROLOGUE_OFFSET 8 #endif static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) @@ -207,6 +252,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; const u8 tcc = bpf2a64[TCALL_CNT]; + const u8 fpb = bpf2a64[FP_BOTTOM]; const int idx0 = ctx->idx; int cur_offset; @@ -233,8 +279,11 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) * */ + /* Sign lr */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) + emit(A64_PACIASP, ctx); /* BTI landing pad */ - if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + else if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) emit(A64_BTI_C, ctx); /* Save FP and LR registers to stay align with ARM64 AAPCS */ @@ -245,6 +294,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) emit(A64_PUSH(r6, r7, A64_SP), ctx); emit(A64_PUSH(r8, r9, A64_SP), ctx); emit(A64_PUSH(fp, tcc, A64_SP), ctx); + emit(A64_PUSH(fpb, A64_R(28), A64_SP), ctx); /* Set up BPF prog stack base register */ emit(A64_MOV(1, fp, A64_SP), ctx); @@ -265,6 +315,8 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) emit(A64_BTI_J, ctx); } + emit(A64_SUB_I(1, fpb, fp, ctx->fpb_offset), ctx); + /* Stack must be multiples of 16B */ ctx->stack_size = round_up(prog->aux->stack_depth, 16); @@ -512,10 +564,13 @@ static void build_epilogue(struct jit_ctx *ctx) const u8 r8 = bpf2a64[BPF_REG_8]; const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; + const u8 fpb = bpf2a64[FP_BOTTOM]; /* We're done with BPF stack */ emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + /* Restore x27 and x28 */ + emit(A64_POP(fpb, A64_R(28), A64_SP), ctx); /* Restore fs (x25) and x26 */ emit(A64_POP(fp, A64_R(26), A64_SP), ctx); @@ -529,6 +584,10 @@ static void build_epilogue(struct jit_ctx *ctx) /* Set return value */ emit(A64_MOV(1, A64_R(0), r0), ctx); + /* Authenticate lr */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) + emit(A64_AUTIASP, ctx); + emit(A64_RET(A64_LR), ctx); } @@ -609,6 +668,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, const u8 src = bpf2a64[insn->src_reg]; const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; + const u8 fp = bpf2a64[BPF_REG_FP]; + const u8 fpb = bpf2a64[FP_BOTTOM]; const s16 off = insn->off; const s32 imm = insn->imm; const int i = insn - ctx->prog->insnsi; @@ -617,6 +678,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, u8 jmp_cond; s32 jmp_offset; u32 a64_insn; + u8 src_adj; + u8 dst_adj; + int off_adj; int ret; switch (code) { @@ -971,19 +1035,45 @@ emit_cond_jmp: case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_PROBE_MEM | BPF_H: case BPF_LDX | BPF_PROBE_MEM | BPF_B: - emit_a64_mov_i(1, tmp, off, ctx); + if (ctx->fpb_offset > 0 && src == fp) { + src_adj = fpb; + off_adj = off + ctx->fpb_offset; + } else { + src_adj = src; + off_adj = off; + } switch (BPF_SIZE(code)) { case BPF_W: - emit(A64_LDR32(dst, src, tmp), ctx); + if (is_lsi_offset(off_adj, 2)) { + emit(A64_LDR32I(dst, src_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDR32(dst, src, tmp), ctx); + } break; case BPF_H: - emit(A64_LDRH(dst, src, tmp), ctx); + if (is_lsi_offset(off_adj, 1)) { + emit(A64_LDRHI(dst, src_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDRH(dst, src, tmp), ctx); + } break; case BPF_B: - emit(A64_LDRB(dst, src, tmp), ctx); + if (is_lsi_offset(off_adj, 0)) { + emit(A64_LDRBI(dst, src_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDRB(dst, src, tmp), ctx); + } break; case BPF_DW: - emit(A64_LDR64(dst, src, tmp), ctx); + if (is_lsi_offset(off_adj, 3)) { + emit(A64_LDR64I(dst, src_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDR64(dst, src, tmp), ctx); + } break; } @@ -1010,21 +1100,47 @@ emit_cond_jmp: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_DW: + if (ctx->fpb_offset > 0 && dst == fp) { + dst_adj = fpb; + off_adj = off + ctx->fpb_offset; + } else { + dst_adj = dst; + off_adj = off; + } /* Load imm to a register then store it */ - emit_a64_mov_i(1, tmp2, off, ctx); emit_a64_mov_i(1, tmp, imm, ctx); switch (BPF_SIZE(code)) { case BPF_W: - emit(A64_STR32(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off_adj, 2)) { + emit(A64_STR32I(tmp, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STR32(tmp, dst, tmp2), ctx); + } break; case BPF_H: - emit(A64_STRH(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off_adj, 1)) { + emit(A64_STRHI(tmp, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STRH(tmp, dst, tmp2), ctx); + } break; case BPF_B: - emit(A64_STRB(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off_adj, 0)) { + emit(A64_STRBI(tmp, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STRB(tmp, dst, tmp2), ctx); + } break; case BPF_DW: - emit(A64_STR64(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off_adj, 3)) { + emit(A64_STR64I(tmp, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STR64(tmp, dst, tmp2), ctx); + } break; } break; @@ -1034,19 +1150,45 @@ emit_cond_jmp: case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_B: case BPF_STX | BPF_MEM | BPF_DW: - emit_a64_mov_i(1, tmp, off, ctx); + if (ctx->fpb_offset > 0 && dst == fp) { + dst_adj = fpb; + off_adj = off + ctx->fpb_offset; + } else { + dst_adj = dst; + off_adj = off; + } switch (BPF_SIZE(code)) { case BPF_W: - emit(A64_STR32(src, dst, tmp), ctx); + if (is_lsi_offset(off_adj, 2)) { + emit(A64_STR32I(src, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STR32(src, dst, tmp), ctx); + } break; case BPF_H: - emit(A64_STRH(src, dst, tmp), ctx); + if (is_lsi_offset(off_adj, 1)) { + emit(A64_STRHI(src, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STRH(src, dst, tmp), ctx); + } break; case BPF_B: - emit(A64_STRB(src, dst, tmp), ctx); + if (is_lsi_offset(off_adj, 0)) { + emit(A64_STRBI(src, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STRB(src, dst, tmp), ctx); + } break; case BPF_DW: - emit(A64_STR64(src, dst, tmp), ctx); + if (is_lsi_offset(off_adj, 3)) { + emit(A64_STR64I(src, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STR64(src, dst, tmp), ctx); + } break; } break; @@ -1069,6 +1211,79 @@ emit_cond_jmp: return 0; } +/* + * Return 0 if FP may change at runtime, otherwise find the minimum negative + * offset to FP, converts it to positive number, and align down to 8 bytes. + */ +static int find_fpb_offset(struct bpf_prog *prog) +{ + int i; + int offset = 0; + + for (i = 0; i < prog->len; i++) { + const struct bpf_insn *insn = &prog->insnsi[i]; + const u8 class = BPF_CLASS(insn->code); + const u8 mode = BPF_MODE(insn->code); + const u8 src = insn->src_reg; + const u8 dst = insn->dst_reg; + const s32 imm = insn->imm; + const s16 off = insn->off; + + switch (class) { + case BPF_STX: + case BPF_ST: + /* fp holds atomic operation result */ + if (class == BPF_STX && mode == BPF_ATOMIC && + ((imm == BPF_XCHG || + imm == (BPF_FETCH | BPF_ADD) || + imm == (BPF_FETCH | BPF_AND) || + imm == (BPF_FETCH | BPF_XOR) || + imm == (BPF_FETCH | BPF_OR)) && + src == BPF_REG_FP)) + return 0; + + if (mode == BPF_MEM && dst == BPF_REG_FP && + off < offset) + offset = insn->off; + break; + + case BPF_JMP32: + case BPF_JMP: + break; + + case BPF_LDX: + case BPF_LD: + /* fp holds load result */ + if (dst == BPF_REG_FP) + return 0; + + if (class == BPF_LDX && mode == BPF_MEM && + src == BPF_REG_FP && off < offset) + offset = off; + break; + + case BPF_ALU: + case BPF_ALU64: + default: + /* fp holds ALU result */ + if (dst == BPF_REG_FP) + return 0; + } + } + + if (offset < 0) { + /* + * safely be converted to a positive 'int', since insn->off + * is 's16' + */ + offset = -offset; + /* align down to 8 bytes */ + offset = ALIGN_DOWN(offset, 8); + } + + return offset; +} + static int build_body(struct jit_ctx *ctx, bool extra_pass) { const struct bpf_prog *prog = ctx->prog; @@ -1190,6 +1405,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto out_off; } + ctx.fpb_offset = find_fpb_offset(prog); + /* * 1. Initial fake pass to compute ctx->idx and ctx->offset. * diff --git a/arch/arm64/tools/Makefile b/arch/arm64/tools/Makefile index cf1307188150..07a93ab21a62 100644 --- a/arch/arm64/tools/Makefile +++ b/arch/arm64/tools/Makefile @@ -3,7 +3,7 @@ gen := arch/$(ARCH)/include/generated kapi := $(gen)/asm -kapi-hdrs-y := $(kapi)/cpucaps.h +kapi-hdrs-y := $(kapi)/cpucaps.h $(kapi)/sysreg-defs.h targets += $(addprefix ../../../, $(kapi-hdrs-y)) @@ -14,5 +14,11 @@ kapi: $(kapi-hdrs-y) quiet_cmd_gen_cpucaps = GEN $@ cmd_gen_cpucaps = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@ +quiet_cmd_gen_sysreg = GEN $@ + cmd_gen_sysreg = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@ + $(kapi)/cpucaps.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE $(call if_changed,gen_cpucaps) + +$(kapi)/sysreg-defs.h: $(src)/gen-sysreg.awk $(src)/sysreg FORCE + $(call if_changed,gen_sysreg) diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 3ed418f70e3b..e52b289a27c2 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -43,6 +43,8 @@ KVM_PROTECTED_MODE MISMATCHED_CACHE_TYPE MTE MTE_ASYMM +SME +SME_FA64 SPECTRE_V2 SPECTRE_V3A SPECTRE_V4 diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk new file mode 100755 index 000000000000..89bfb74e28de --- /dev/null +++ b/arch/arm64/tools/gen-sysreg.awk @@ -0,0 +1,268 @@ +#!/bin/awk -f +# SPDX-License-Identifier: GPL-2.0 +# gen-sysreg.awk: arm64 sysreg header generator +# +# Usage: awk -f gen-sysreg.awk sysregs.txt + +# Log an error and terminate +function fatal(msg) { + print "Error at " NR ": " msg > "/dev/stderr" + exit 1 +} + +# Sanity check that the start or end of a block makes sense at this point in +# the file. If not, produce an error and terminate. +# +# @this - the $Block or $EndBlock +# @prev - the only valid block to already be in (value of @block) +# @new - the new value of @block +function change_block(this, prev, new) { + if (block != prev) + fatal("unexpected " this " (inside " block ")") + + block = new +} + +# Sanity check the number of records for a field makes sense. If not, produce +# an error and terminate. +function expect_fields(nf) { + if (NF != nf) + fatal(NF " fields found where " nf " expected") +} + +# Print a CPP macro definition, padded with spaces so that the macro bodies +# line up in a column +function define(name, val) { + printf "%-48s%s\n", "#define " name, val +} + +# Print standard BITMASK/SHIFT/WIDTH CPP definitions for a field +function define_field(reg, field, msb, lsb) { + define(reg "_" field, "GENMASK(" msb ", " lsb ")") + define(reg "_" field "_MASK", "GENMASK(" msb ", " lsb ")") + define(reg "_" field "_SHIFT", lsb) + define(reg "_" field "_WIDTH", msb - lsb + 1) +} + +# Parse a "<msb>[:<lsb>]" string into the global variables @msb and @lsb +function parse_bitdef(reg, field, bitdef, _bits) +{ + if (bitdef ~ /^[0-9]+$/) { + msb = bitdef + lsb = bitdef + } else if (split(bitdef, _bits, ":") == 2) { + msb = _bits[1] + lsb = _bits[2] + } else { + fatal("invalid bit-range definition '" bitdef "'") + } + + + if (msb != next_bit) + fatal(reg "." field " starts at " msb " not " next_bit) + if (63 < msb || msb < 0) + fatal(reg "." field " invalid high bit in '" bitdef "'") + if (63 < lsb || lsb < 0) + fatal(reg "." field " invalid low bit in '" bitdef "'") + if (msb < lsb) + fatal(reg "." field " invalid bit-range '" bitdef "'") + if (low > high) + fatal(reg "." field " has invalid range " high "-" low) + + next_bit = lsb - 1 +} + +BEGIN { + print "#ifndef __ASM_SYSREG_DEFS_H" + print "#define __ASM_SYSREG_DEFS_H" + print "" + print "/* Generated file - do not edit */" + print "" + + block = "None" +} + +END { + print "#endif /* __ASM_SYSREG_DEFS_H */" +} + +# skip blank lines and comment lines +/^$/ { next } +/^#/ { next } + +/^SysregFields/ { + change_block("SysregFields", "None", "SysregFields") + expect_fields(2) + + reg = $2 + + res0 = "UL(0)" + res1 = "UL(0)" + + next_bit = 63 + + next +} + +/^EndSysregFields/ { + if (next_bit > 0) + fatal("Unspecified bits in " reg) + + change_block("EndSysregFields", "SysregFields", "None") + + define(reg "_RES0", "(" res0 ")") + define(reg "_RES1", "(" res1 ")") + print "" + + reg = null + res0 = null + res1 = null + + next +} + +/^Sysreg/ { + change_block("Sysreg", "None", "Sysreg") + expect_fields(7) + + reg = $2 + op0 = $3 + op1 = $4 + crn = $5 + crm = $6 + op2 = $7 + + res0 = "UL(0)" + res1 = "UL(0)" + + define("REG_" reg, "S" op0 "_" op1 "_C" crn "_C" crm "_" op2) + define("SYS_" reg, "sys_reg(" op0 ", " op1 ", " crn ", " crm ", " op2 ")") + + define("SYS_" reg "_Op0", op0) + define("SYS_" reg "_Op1", op1) + define("SYS_" reg "_CRn", crn) + define("SYS_" reg "_CRm", crm) + define("SYS_" reg "_Op2", op2) + + print "" + + next_bit = 63 + + next +} + +/^EndSysreg/ { + if (next_bit > 0) + fatal("Unspecified bits in " reg) + + change_block("EndSysreg", "Sysreg", "None") + + if (res0 != null) + define(reg "_RES0", "(" res0 ")") + if (res1 != null) + define(reg "_RES1", "(" res1 ")") + if (res0 != null || res1 != null) + print "" + + reg = null + op0 = null + op1 = null + crn = null + crm = null + op2 = null + res0 = null + res1 = null + + next +} + +# Currently this is effectivey a comment, in future we may want to emit +# defines for the fields. +/^Fields/ && (block == "Sysreg") { + expect_fields(2) + + if (next_bit != 63) + fatal("Some fields already defined for " reg) + + print "/* For " reg " fields see " $2 " */" + print "" + + next_bit = 0 + res0 = null + res1 = null + + next +} + + +/^Res0/ && (block == "Sysreg" || block == "SysregFields") { + expect_fields(2) + parse_bitdef(reg, "RES0", $2) + field = "RES0_" msb "_" lsb + + res0 = res0 " | GENMASK_ULL(" msb ", " lsb ")" + + next +} + +/^Res1/ && (block == "Sysreg" || block == "SysregFields") { + expect_fields(2) + parse_bitdef(reg, "RES1", $2) + field = "RES1_" msb "_" lsb + + res1 = res1 " | GENMASK_ULL(" msb ", " lsb ")" + + next +} + +/^Field/ && (block == "Sysreg" || block == "SysregFields") { + expect_fields(3) + field = $3 + parse_bitdef(reg, field, $2) + + define_field(reg, field, msb, lsb) + print "" + + next +} + +/^Raz/ && (block == "Sysreg" || block == "SysregFields") { + expect_fields(2) + parse_bitdef(reg, field, $2) + + next +} + +/^Enum/ { + change_block("Enum", "Sysreg", "Enum") + expect_fields(3) + field = $3 + parse_bitdef(reg, field, $2) + + define_field(reg, field, msb, lsb) + + next +} + +/^EndEnum/ { + change_block("EndEnum", "Enum", "Sysreg") + field = null + msb = null + lsb = null + print "" + next +} + +/0b[01]+/ && block = "Enum" { + expect_fields(2) + val = $1 + name = $2 + + define(reg "_" field "_" name, "UL(" val ")") + next +} + +# Any lines not handled by previous rules are unexpected +{ + fatal("unhandled statement") +} diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg new file mode 100644 index 000000000000..ff5e552f7420 --- /dev/null +++ b/arch/arm64/tools/sysreg @@ -0,0 +1,369 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# System register metadata + +# Each System register is described by a Sysreg block: + +# Sysreg <name> <op0> <op1> <crn> <crm> <op2> +# <field> +# ... +# EndSysreg + +# Within a Sysreg block, each field can be described as one of: + +# Res0 <msb>[:<lsb>] + +# Res1 <msb>[:<lsb>] + +# Field <msb>[:<lsb>] <name> + +# Enum <msb>[:<lsb>] <name> +# <enumval> <enumname> +# ... +# EndEnum + +# Alternatively if multiple registers share the same layout then +# a SysregFields block can be used to describe the shared layout + +# SysregFields <fieldsname> +# <field> +# ... +# EndSysregFields + +# and referenced from within the Sysreg: + +# Sysreg <name> <op0> <op1> <crn> <crm> <op2> +# Fields <fieldsname> +# EndSysreg + +# For ID registers we adopt a few conventions for translating the +# language in the ARM into defines: +# +# NI - Not implemented +# IMP - Implemented +# +# In general it is recommended that new enumeration items be named for the +# feature that introduces them (eg, FEAT_LS64_ACCDATA introduces enumeration +# item ACCDATA) though it may be more taseful to do something else. + +Sysreg ID_AA64ISAR0_EL1 3 0 0 6 0 +Enum 63:60 RNDR + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 59:56 TLB + 0b0000 NI + 0b0001 OS + 0b0010 RANGE +EndEnum +Enum 55:52 TS + 0b0000 NI + 0b0001 FLAGM + 0b0010 FLAGM2 +EndEnum +Enum 51:48 FHM + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 47:44 DP + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 43:40 SM4 + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 39:36 SM3 + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 35:32 SHA3 + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 31:28 RDM + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 27:24 TME + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 23:20 ATOMIC + 0b0000 NI + 0b0010 IMP +EndEnum +Enum 19:16 CRC32 + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 15:12 SHA2 + 0b0000 NI + 0b0001 SHA256 + 0b0010 SHA512 +EndEnum +Enum 11:8 SHA1 + 0b0000 NI + 0b0001 IMP +EndEnum +Enum 7:4 AES + 0b0000 NI + 0b0001 AES + 0b0010 PMULL +EndEnum +Res0 3:0 +EndSysreg + +Sysreg SCTLR_EL1 3 0 1 0 0 +Field 63 TIDCP +Field 62 SPINMASK +Field 61 NMI +Field 60 EnTP2 +Res0 59:58 +Field 57 EPAN +Field 56 EnALS +Field 55 EnAS0 +Field 54 EnASR +Field 53 TME +Field 52 TME0 +Field 51 TMT +Field 50 TMT0 +Field 49:46 TWEDEL +Field 45 TWEDEn +Field 44 DSSBS +Field 43 ATA +Field 42 ATA0 +Enum 41:40 TCF + 0b00 NONE + 0b01 SYNC + 0b10 ASYNC + 0b11 ASYMM +EndEnum +Enum 39:38 TCF0 + 0b00 NONE + 0b01 SYNC + 0b10 ASYNC + 0b11 ASYMM +EndEnum +Field 37 ITFSB +Field 36 BT1 +Field 35 BT0 +Res0 34 +Field 33 MSCEn +Field 32 CMOW +Field 31 EnIA +Field 30 EnIB +Field 29 LSMAOE +Field 28 nTLSMD +Field 27 EnDA +Field 26 UCI +Field 25 EE +Field 24 E0E +Field 23 SPAN +Field 22 EIS +Field 21 IESB +Field 20 TSCXT +Field 19 WXN +Field 18 nTWE +Res0 17 +Field 16 nTWI +Field 15 UCT +Field 14 DZE +Field 13 EnDB +Field 12 I +Field 11 EOS +Field 10 EnRCTX +Field 9 UMA +Field 8 SED +Field 7 ITD +Field 6 nAA +Field 5 CP15BEN +Field 4 SA0 +Field 3 SA +Field 2 C +Field 1 A +Field 0 M +EndSysreg + +SysregFields CPACR_ELx +Res0 63:29 +Field 28 TTA +Res0 27:26 +Field 25:24 SMEN +Res0 23:22 +Field 21:20 FPEN +Res0 19:18 +Field 17:16 ZEN +Res0 15:0 +EndSysregFields + +Sysreg CPACR_EL1 3 0 1 0 2 +Fields CPACR_ELx +EndSysreg + +Sysreg SMPRI_EL1 3 0 1 2 4 +Res0 63:4 +Field 3:0 PRIORITY +EndSysreg + +SysregFields ZCR_ELx +Res0 63:9 +Raz 8:4 +Field 3:0 LEN +EndSysregFields + +Sysreg ZCR_EL1 3 0 1 2 0 +Fields ZCR_ELx +EndSysreg + +SysregFields SMCR_ELx +Res0 63:32 +Field 31 FA64 +Res0 30:9 +Raz 8:4 +Field 3:0 LEN +EndSysregFields + +Sysreg SMCR_EL1 3 0 1 2 6 +Fields SMCR_ELx +EndSysreg + +Sysreg FAR_EL1 3 0 6 0 0 +Field 63:0 ADDR +EndSysreg + +SysregFields CONTEXTIDR_ELx +Res0 63:32 +Field 31:0 PROCID +EndSysregFields + +Sysreg CONTEXTIDR_EL1 3 0 13 0 1 +Fields CONTEXTIDR_ELx +EndSysreg + +Sysreg CLIDR_EL1 3 1 0 0 1 +Res0 63:47 +Field 46:33 Ttypen +Field 32:30 ICB +Field 29:27 LoUU +Field 26:24 LoC +Field 23:21 LoUIS +Field 20:18 Ctype7 +Field 17:15 Ctype6 +Field 14:12 Ctype5 +Field 11:9 Ctype4 +Field 8:6 Ctype3 +Field 5:3 Ctype2 +Field 2:0 Ctype1 +EndSysreg + +Sysreg SMIDR_EL1 3 1 0 0 6 +Res0 63:32 +Field 31:24 IMPLEMENTER +Field 23:16 REVISION +Field 15 SMPS +Res0 14:12 +Field 11:0 AFFINITY +EndSysreg + +Sysreg CSSELR_EL1 3 2 0 0 0 +Res0 63:5 +Field 4 TnD +Field 3:1 Level +Field 0 InD +EndSysreg + +Sysreg SVCR 3 3 4 2 2 +Res0 63:2 +Field 1 ZA +Field 0 SM +EndSysreg + +Sysreg ZCR_EL2 3 4 1 2 0 +Fields ZCR_ELx +EndSysreg + +Sysreg SMPRIMAP_EL2 3 4 1 2 5 +Field 63:60 P15 +Field 59:56 P14 +Field 55:52 P13 +Field 51:48 P12 +Field 47:44 P11 +Field 43:40 P10 +Field 39:36 F9 +Field 35:32 P8 +Field 31:28 P7 +Field 27:24 P6 +Field 23:20 P5 +Field 19:16 P4 +Field 15:12 P3 +Field 11:8 P2 +Field 7:4 P1 +Field 3:0 P0 +EndSysreg + +Sysreg SMCR_EL2 3 4 1 2 6 +Fields SMCR_ELx +EndSysreg + +Sysreg DACR32_EL2 3 4 3 0 0 +Res0 63:32 +Field 31:30 D15 +Field 29:28 D14 +Field 27:26 D13 +Field 25:24 D12 +Field 23:22 D11 +Field 21:20 D10 +Field 19:18 D9 +Field 17:16 D8 +Field 15:14 D7 +Field 13:12 D6 +Field 11:10 D5 +Field 9:8 D4 +Field 7:6 D3 +Field 5:4 D2 +Field 3:2 D1 +Field 1:0 D0 +EndSysreg + +Sysreg FAR_EL2 3 4 6 0 0 +Field 63:0 ADDR +EndSysreg + +Sysreg CONTEXTIDR_EL2 3 4 13 0 1 +Fields CONTEXTIDR_ELx +EndSysreg + +Sysreg CPACR_EL12 3 5 1 0 2 +Fields CPACR_ELx +EndSysreg + +Sysreg ZCR_EL12 3 5 1 2 0 +Fields ZCR_ELx +EndSysreg + +Sysreg SMCR_EL12 3 5 1 2 6 +Fields SMCR_ELx +EndSysreg + +Sysreg FAR_EL12 3 5 6 0 0 +Field 63:0 ADDR +EndSysreg + +Sysreg CONTEXTIDR_EL12 3 5 13 0 1 +Fields CONTEXTIDR_ELx +EndSysreg + +SysregFields TTBRx_EL1 +Field 63:48 ASID +Field 47:1 BADDR +Field 0 CnP +EndSysregFields + +Sysreg TTBR0_EL1 3 0 2 0 0 +Fields TTBRx_EL1 +EndSysreg + +Sysreg TTBR1_EL1 3 0 2 0 1 +Fields TTBRx_EL1 +EndSysreg diff --git a/arch/csky/Kbuild b/arch/csky/Kbuild index 4e39f7abdeb6..0621eaea4196 100644 --- a/arch/csky/Kbuild +++ b/arch/csky/Kbuild @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-y += kernel/ mm/ + # for cleaning subdir- += boot diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 75ef86605d69..21d72b078eef 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -320,6 +320,14 @@ config HOTPLUG_CPU controlled through /sys/devices/system/cpu/cpu1/hotplug/target. Say N if you want to disable CPU hotplug. + +config HAVE_EFFICIENT_UNALIGNED_STRING_OPS + bool "Enable EFFICIENT_UNALIGNED_STRING_OPS for abiv2" + depends on CPU_CK807 || CPU_CK810 || CPU_CK860 + help + Say Y here to enable EFFICIENT_UNALIGNED_STRING_OPS. Some CPU models could + deal with unaligned access by hardware. + endmenu source "arch/csky/Kconfig.platforms" diff --git a/arch/csky/Makefile b/arch/csky/Makefile index 866805077636..4e1d619fd5c6 100644 --- a/arch/csky/Makefile +++ b/arch/csky/Makefile @@ -61,15 +61,12 @@ KBUILD_AFLAGS += $(KBUILD_CFLAGS) head-y := arch/csky/kernel/head.o -core-y += arch/csky/kernel/ -core-y += arch/csky/mm/ core-y += arch/csky/$(CSKYABI)/ libs-y += arch/csky/lib/ \ $(shell $(CC) $(KBUILD_CFLAGS) $(KCFLAGS) -print-libgcc-file-name) boot := arch/csky/boot -core-y += $(boot)/dts/ all: zImage diff --git a/arch/csky/abiv1/Makefile b/arch/csky/abiv1/Makefile index 601ce3b2fb85..a4b2ade0fc67 100644 --- a/arch/csky/abiv1/Makefile +++ b/arch/csky/abiv1/Makefile @@ -4,5 +4,3 @@ obj-y += bswapdi.o obj-y += bswapsi.o obj-y += cacheflush.o obj-y += mmap.o -obj-y += memcpy.o -obj-y += strksyms.o diff --git a/arch/csky/abiv1/memcpy.S b/arch/csky/abiv1/memcpy.S deleted file mode 100644 index 5078eb5169fa..000000000000 --- a/arch/csky/abiv1/memcpy.S +++ /dev/null @@ -1,347 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. - -#include <linux/linkage.h> - -.macro GET_FRONT_BITS rx y -#ifdef __cskyLE__ - lsri \rx, \y -#else - lsli \rx, \y -#endif -.endm - -.macro GET_AFTER_BITS rx y -#ifdef __cskyLE__ - lsli \rx, \y -#else - lsri \rx, \y -#endif -.endm - -/* void *memcpy(void *dest, const void *src, size_t n); */ -ENTRY(memcpy) - mov r7, r2 - cmplti r4, 4 - bt .L_copy_by_byte - mov r6, r2 - andi r6, 3 - cmpnei r6, 0 - jbt .L_dest_not_aligned - mov r6, r3 - andi r6, 3 - cmpnei r6, 0 - jbt .L_dest_aligned_but_src_not_aligned -.L0: - cmplti r4, 16 - jbt .L_aligned_and_len_less_16bytes - subi sp, 8 - stw r8, (sp, 0) -.L_aligned_and_len_larger_16bytes: - ldw r1, (r3, 0) - ldw r5, (r3, 4) - ldw r8, (r3, 8) - stw r1, (r7, 0) - ldw r1, (r3, 12) - stw r5, (r7, 4) - stw r8, (r7, 8) - stw r1, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L_aligned_and_len_larger_16bytes - ldw r8, (sp, 0) - addi sp, 8 - cmpnei r4, 0 - jbf .L_return - -.L_aligned_and_len_less_16bytes: - cmplti r4, 4 - bt .L_copy_by_byte -.L1: - ldw r1, (r3, 0) - stw r1, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - jbf .L1 - br .L_copy_by_byte - -.L_return: - rts - -.L_copy_by_byte: /* len less than 4 bytes */ - cmpnei r4, 0 - jbf .L_return -.L4: - ldb r1, (r3, 0) - stb r1, (r7, 0) - addi r3, 1 - addi r7, 1 - decne r4 - jbt .L4 - rts - -/* - * If dest is not aligned, just copying some bytes makes the dest align. - * Afther that, we judge whether the src is aligned. - */ -.L_dest_not_aligned: - mov r5, r3 - rsub r5, r5, r7 - abs r5, r5 - cmplt r5, r4 - bt .L_copy_by_byte - mov r5, r7 - sub r5, r3 - cmphs r5, r4 - bf .L_copy_by_byte - mov r5, r6 -.L5: - ldb r1, (r3, 0) /* makes the dest align. */ - stb r1, (r7, 0) - addi r5, 1 - subi r4, 1 - addi r3, 1 - addi r7, 1 - cmpnei r5, 4 - jbt .L5 - cmplti r4, 4 - jbt .L_copy_by_byte - mov r6, r3 /* judge whether the src is aligned. */ - andi r6, 3 - cmpnei r6, 0 - jbf .L0 - -/* Judge the number of misaligned, 1, 2, 3? */ -.L_dest_aligned_but_src_not_aligned: - mov r5, r3 - rsub r5, r5, r7 - abs r5, r5 - cmplt r5, r4 - bt .L_copy_by_byte - bclri r3, 0 - bclri r3, 1 - ldw r1, (r3, 0) - addi r3, 4 - cmpnei r6, 2 - bf .L_dest_aligned_but_src_not_aligned_2bytes - cmpnei r6, 3 - bf .L_dest_aligned_but_src_not_aligned_3bytes - -.L_dest_aligned_but_src_not_aligned_1byte: - mov r5, r7 - sub r5, r3 - cmphs r5, r4 - bf .L_copy_by_byte - cmplti r4, 16 - bf .L11 -.L10: /* If the len is less than 16 bytes */ - GET_FRONT_BITS r1 8 - mov r5, r1 - ldw r6, (r3, 0) - mov r1, r6 - GET_AFTER_BITS r6 24 - or r5, r6 - stw r5, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - bf .L10 - subi r3, 3 - br .L_copy_by_byte -.L11: - subi sp, 16 - stw r8, (sp, 0) - stw r9, (sp, 4) - stw r10, (sp, 8) - stw r11, (sp, 12) -.L12: - ldw r5, (r3, 0) - ldw r11, (r3, 4) - ldw r8, (r3, 8) - ldw r9, (r3, 12) - - GET_FRONT_BITS r1 8 /* little or big endian? */ - mov r10, r5 - GET_AFTER_BITS r5 24 - or r5, r1 - - GET_FRONT_BITS r10 8 - mov r1, r11 - GET_AFTER_BITS r11 24 - or r11, r10 - - GET_FRONT_BITS r1 8 - mov r10, r8 - GET_AFTER_BITS r8 24 - or r8, r1 - - GET_FRONT_BITS r10 8 - mov r1, r9 - GET_AFTER_BITS r9 24 - or r9, r10 - - stw r5, (r7, 0) - stw r11, (r7, 4) - stw r8, (r7, 8) - stw r9, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L12 - ldw r8, (sp, 0) - ldw r9, (sp, 4) - ldw r10, (sp, 8) - ldw r11, (sp, 12) - addi sp , 16 - cmplti r4, 4 - bf .L10 - subi r3, 3 - br .L_copy_by_byte - -.L_dest_aligned_but_src_not_aligned_2bytes: - cmplti r4, 16 - bf .L21 -.L20: - GET_FRONT_BITS r1 16 - mov r5, r1 - ldw r6, (r3, 0) - mov r1, r6 - GET_AFTER_BITS r6 16 - or r5, r6 - stw r5, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - bf .L20 - subi r3, 2 - br .L_copy_by_byte - rts - -.L21: /* n > 16 */ - subi sp, 16 - stw r8, (sp, 0) - stw r9, (sp, 4) - stw r10, (sp, 8) - stw r11, (sp, 12) - -.L22: - ldw r5, (r3, 0) - ldw r11, (r3, 4) - ldw r8, (r3, 8) - ldw r9, (r3, 12) - - GET_FRONT_BITS r1 16 - mov r10, r5 - GET_AFTER_BITS r5 16 - or r5, r1 - - GET_FRONT_BITS r10 16 - mov r1, r11 - GET_AFTER_BITS r11 16 - or r11, r10 - - GET_FRONT_BITS r1 16 - mov r10, r8 - GET_AFTER_BITS r8 16 - or r8, r1 - - GET_FRONT_BITS r10 16 - mov r1, r9 - GET_AFTER_BITS r9 16 - or r9, r10 - - stw r5, (r7, 0) - stw r11, (r7, 4) - stw r8, (r7, 8) - stw r9, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L22 - ldw r8, (sp, 0) - ldw r9, (sp, 4) - ldw r10, (sp, 8) - ldw r11, (sp, 12) - addi sp, 16 - cmplti r4, 4 - bf .L20 - subi r3, 2 - br .L_copy_by_byte - - -.L_dest_aligned_but_src_not_aligned_3bytes: - cmplti r4, 16 - bf .L31 -.L30: - GET_FRONT_BITS r1 24 - mov r5, r1 - ldw r6, (r3, 0) - mov r1, r6 - GET_AFTER_BITS r6 8 - or r5, r6 - stw r5, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - bf .L30 - subi r3, 1 - br .L_copy_by_byte -.L31: - subi sp, 16 - stw r8, (sp, 0) - stw r9, (sp, 4) - stw r10, (sp, 8) - stw r11, (sp, 12) -.L32: - ldw r5, (r3, 0) - ldw r11, (r3, 4) - ldw r8, (r3, 8) - ldw r9, (r3, 12) - - GET_FRONT_BITS r1 24 - mov r10, r5 - GET_AFTER_BITS r5 8 - or r5, r1 - - GET_FRONT_BITS r10 24 - mov r1, r11 - GET_AFTER_BITS r11 8 - or r11, r10 - - GET_FRONT_BITS r1 24 - mov r10, r8 - GET_AFTER_BITS r8 8 - or r8, r1 - - GET_FRONT_BITS r10 24 - mov r1, r9 - GET_AFTER_BITS r9 8 - or r9, r10 - - stw r5, (r7, 0) - stw r11, (r7, 4) - stw r8, (r7, 8) - stw r9, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L32 - ldw r8, (sp, 0) - ldw r9, (sp, 4) - ldw r10, (sp, 8) - ldw r11, (sp, 12) - addi sp, 16 - cmplti r4, 4 - bf .L30 - subi r3, 1 - br .L_copy_by_byte diff --git a/arch/csky/abiv1/strksyms.c b/arch/csky/abiv1/strksyms.c deleted file mode 100644 index c7ccbb27e8d7..000000000000 --- a/arch/csky/abiv1/strksyms.c +++ /dev/null @@ -1,6 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. - -#include <linux/module.h> - -EXPORT_SYMBOL(memcpy); diff --git a/arch/csky/abiv2/Makefile b/arch/csky/abiv2/Makefile index c561efa5533c..ea8005fe01a8 100644 --- a/arch/csky/abiv2/Makefile +++ b/arch/csky/abiv2/Makefile @@ -2,9 +2,11 @@ obj-y += cacheflush.o obj-$(CONFIG_CPU_HAS_FPU) += fpu.o obj-y += memcmp.o +ifeq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y) obj-y += memcpy.o obj-y += memmove.o obj-y += memset.o +endif obj-y += strcmp.o obj-y += strcpy.o obj-y += strlen.o diff --git a/arch/csky/abiv2/strksyms.c b/arch/csky/abiv2/strksyms.c index 06da723d8202..8d1fd28c6cf9 100644 --- a/arch/csky/abiv2/strksyms.c +++ b/arch/csky/abiv2/strksyms.c @@ -3,10 +3,12 @@ #include <linux/module.h> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(memmove); +#endif +EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strcpy); EXPORT_SYMBOL(strlen); diff --git a/arch/csky/boot/Makefile b/arch/csky/boot/Makefile index dbc9b1bd72f0..c3cfde28f8e6 100644 --- a/arch/csky/boot/Makefile +++ b/arch/csky/boot/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only targets := Image zImage uImage -targets += $(dtb-y) $(obj)/Image: vmlinux FORCE $(call if_changed,objcopy) diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h new file mode 100644 index 000000000000..60406ef9c2bb --- /dev/null +++ b/arch/csky/include/asm/atomic.h @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_CSKY_ATOMIC_H +#define __ASM_CSKY_ATOMIC_H + +#ifdef CONFIG_SMP +#include <asm-generic/atomic64.h> + +#include <asm/cmpxchg.h> +#include <asm/barrier.h> + +#define __atomic_acquire_fence() __bar_brarw() + +#define __atomic_release_fence() __bar_brwaw() + +static __always_inline int arch_atomic_read(const atomic_t *v) +{ + return READ_ONCE(v->counter); +} +static __always_inline void arch_atomic_set(atomic_t *v, int i) +{ + WRITE_ONCE(v->counter, i); +} + +#define ATOMIC_OP(op) \ +static __always_inline \ +void arch_atomic_##op(int i, atomic_t *v) \ +{ \ + unsigned long tmp; \ + __asm__ __volatile__ ( \ + "1: ldex.w %0, (%2) \n" \ + " " #op " %0, %1 \n" \ + " stex.w %0, (%2) \n" \ + " bez %0, 1b \n" \ + : "=&r" (tmp) \ + : "r" (i), "r" (&v->counter) \ + : "memory"); \ +} + +ATOMIC_OP(add) +ATOMIC_OP(sub) +ATOMIC_OP(and) +ATOMIC_OP( or) +ATOMIC_OP(xor) + +#undef ATOMIC_OP + +#define ATOMIC_FETCH_OP(op) \ +static __always_inline \ +int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v) \ +{ \ + register int ret, tmp; \ + __asm__ __volatile__ ( \ + "1: ldex.w %0, (%3) \n" \ + " mov %1, %0 \n" \ + " " #op " %0, %2 \n" \ + " stex.w %0, (%3) \n" \ + " bez %0, 1b \n" \ + : "=&r" (tmp), "=&r" (ret) \ + : "r" (i), "r"(&v->counter) \ + : "memory"); \ + return ret; \ +} + +#define ATOMIC_OP_RETURN(op, c_op) \ +static __always_inline \ +int arch_atomic_##op##_return_relaxed(int i, atomic_t *v) \ +{ \ + return arch_atomic_fetch_##op##_relaxed(i, v) c_op i; \ +} + +#define ATOMIC_OPS(op, c_op) \ + ATOMIC_FETCH_OP(op) \ + ATOMIC_OP_RETURN(op, c_op) + +ATOMIC_OPS(add, +) +ATOMIC_OPS(sub, -) + +#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed +#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed + +#define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed +#define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed + +#undef ATOMIC_OPS +#undef ATOMIC_OP_RETURN + +#define ATOMIC_OPS(op) \ + ATOMIC_FETCH_OP(op) + +ATOMIC_OPS(and) +ATOMIC_OPS( or) +ATOMIC_OPS(xor) + +#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed +#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed +#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed + +#undef ATOMIC_OPS + +#undef ATOMIC_FETCH_OP + +static __always_inline int +arch_atomic_fetch_add_unless(atomic_t *v, int a, int u) +{ + int prev, tmp; + + __asm__ __volatile__ ( + RELEASE_FENCE + "1: ldex.w %0, (%3) \n" + " cmpne %0, %4 \n" + " bf 2f \n" + " mov %1, %0 \n" + " add %1, %2 \n" + " stex.w %1, (%3) \n" + " bez %1, 1b \n" + FULL_FENCE + "2:\n" + : "=&r" (prev), "=&r" (tmp) + : "r" (a), "r" (&v->counter), "r" (u) + : "memory"); + + return prev; +} +#define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless + +static __always_inline bool +arch_atomic_inc_unless_negative(atomic_t *v) +{ + int rc, tmp; + + __asm__ __volatile__ ( + RELEASE_FENCE + "1: ldex.w %0, (%2) \n" + " movi %1, 0 \n" + " blz %0, 2f \n" + " movi %1, 1 \n" + " addi %0, 1 \n" + " stex.w %0, (%2) \n" + " bez %0, 1b \n" + FULL_FENCE + "2:\n" + : "=&r" (tmp), "=&r" (rc) + : "r" (&v->counter) + : "memory"); + + return tmp ? true : false; + +} +#define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative + +static __always_inline bool +arch_atomic_dec_unless_positive(atomic_t *v) +{ + int rc, tmp; + + __asm__ __volatile__ ( + RELEASE_FENCE + "1: ldex.w %0, (%2) \n" + " movi %1, 0 \n" + " bhz %0, 2f \n" + " movi %1, 1 \n" + " subi %0, 1 \n" + " stex.w %0, (%2) \n" + " bez %0, 1b \n" + FULL_FENCE + "2:\n" + : "=&r" (tmp), "=&r" (rc) + : "r" (&v->counter) + : "memory"); + + return tmp ? true : false; +} +#define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive + +static __always_inline int +arch_atomic_dec_if_positive(atomic_t *v) +{ + int dec, tmp; + + __asm__ __volatile__ ( + RELEASE_FENCE + "1: ldex.w %0, (%2) \n" + " subi %1, %0, 1 \n" + " blz %1, 2f \n" + " stex.w %1, (%2) \n" + " bez %1, 1b \n" + FULL_FENCE + "2:\n" + : "=&r" (dec), "=&r" (tmp) + : "r" (&v->counter) + : "memory"); + + return dec - 1; +} +#define arch_atomic_dec_if_positive arch_atomic_dec_if_positive + +#define ATOMIC_OP() \ +static __always_inline \ +int arch_atomic_xchg_relaxed(atomic_t *v, int n) \ +{ \ + return __xchg_relaxed(n, &(v->counter), 4); \ +} \ +static __always_inline \ +int arch_atomic_cmpxchg_relaxed(atomic_t *v, int o, int n) \ +{ \ + return __cmpxchg_relaxed(&(v->counter), o, n, 4); \ +} \ +static __always_inline \ +int arch_atomic_cmpxchg_acquire(atomic_t *v, int o, int n) \ +{ \ + return __cmpxchg_acquire(&(v->counter), o, n, 4); \ +} \ +static __always_inline \ +int arch_atomic_cmpxchg(atomic_t *v, int o, int n) \ +{ \ + return __cmpxchg(&(v->counter), o, n, 4); \ +} + +#define ATOMIC_OPS() \ + ATOMIC_OP() + +ATOMIC_OPS() + +#define arch_atomic_xchg_relaxed arch_atomic_xchg_relaxed +#define arch_atomic_cmpxchg_relaxed arch_atomic_cmpxchg_relaxed +#define arch_atomic_cmpxchg_acquire arch_atomic_cmpxchg_acquire +#define arch_atomic_cmpxchg arch_atomic_cmpxchg + +#undef ATOMIC_OPS +#undef ATOMIC_OP + +#else +#include <asm-generic/atomic.h> +#endif + +#endif /* __ASM_CSKY_ATOMIC_H */ diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h index f4045dd53e17..15de58b10aec 100644 --- a/arch/csky/include/asm/barrier.h +++ b/arch/csky/include/asm/barrier.h @@ -37,17 +37,21 @@ * bar.brar * bar.bwaw */ +#define FULL_FENCE ".long 0x842fc000\n" +#define ACQUIRE_FENCE ".long 0x8427c000\n" +#define RELEASE_FENCE ".long 0x842ec000\n" + #define __bar_brw() asm volatile (".long 0x842cc000\n":::"memory") #define __bar_br() asm volatile (".long 0x8424c000\n":::"memory") #define __bar_bw() asm volatile (".long 0x8428c000\n":::"memory") #define __bar_arw() asm volatile (".long 0x8423c000\n":::"memory") #define __bar_ar() asm volatile (".long 0x8421c000\n":::"memory") #define __bar_aw() asm volatile (".long 0x8422c000\n":::"memory") -#define __bar_brwarw() asm volatile (".long 0x842fc000\n":::"memory") -#define __bar_brarw() asm volatile (".long 0x8427c000\n":::"memory") +#define __bar_brwarw() asm volatile (FULL_FENCE:::"memory") +#define __bar_brarw() asm volatile (ACQUIRE_FENCE:::"memory") #define __bar_bwarw() asm volatile (".long 0x842bc000\n":::"memory") #define __bar_brwar() asm volatile (".long 0x842dc000\n":::"memory") -#define __bar_brwaw() asm volatile (".long 0x842ec000\n":::"memory") +#define __bar_brwaw() asm volatile (RELEASE_FENCE:::"memory") #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory") #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory") #define __bar_bwaw() asm volatile (".long 0x842ac000\n":::"memory") @@ -56,7 +60,6 @@ #define __smp_rmb() __bar_brar() #define __smp_wmb() __bar_bwaw() -#define ACQUIRE_FENCE ".long 0x8427c000\n" #define __smp_acquire_fence() __bar_brarw() #define __smp_release_fence() __bar_brwaw() diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h index d1bef11f8dc9..5b8faccd65e4 100644 --- a/arch/csky/include/asm/cmpxchg.h +++ b/arch/csky/include/asm/cmpxchg.h @@ -64,15 +64,71 @@ extern void __bad_xchg(void); #define arch_cmpxchg_relaxed(ptr, o, n) \ (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr)))) -#define arch_cmpxchg(ptr, o, n) \ +#define __cmpxchg_acquire(ptr, old, new, size) \ ({ \ + __typeof__(ptr) __ptr = (ptr); \ + __typeof__(new) __new = (new); \ + __typeof__(new) __tmp; \ + __typeof__(old) __old = (old); \ + __typeof__(*(ptr)) __ret; \ + switch (size) { \ + case 4: \ + asm volatile ( \ + "1: ldex.w %0, (%3) \n" \ + " cmpne %0, %4 \n" \ + " bt 2f \n" \ + " mov %1, %2 \n" \ + " stex.w %1, (%3) \n" \ + " bez %1, 1b \n" \ + ACQUIRE_FENCE \ + "2: \n" \ + : "=&r" (__ret), "=&r" (__tmp) \ + : "r" (__new), "r"(__ptr), "r"(__old) \ + :); \ + break; \ + default: \ + __bad_xchg(); \ + } \ + __ret; \ +}) + +#define arch_cmpxchg_acquire(ptr, o, n) \ + (__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr)))) + +#define __cmpxchg(ptr, old, new, size) \ +({ \ + __typeof__(ptr) __ptr = (ptr); \ + __typeof__(new) __new = (new); \ + __typeof__(new) __tmp; \ + __typeof__(old) __old = (old); \ __typeof__(*(ptr)) __ret; \ - __smp_release_fence(); \ - __ret = arch_cmpxchg_relaxed(ptr, o, n); \ - __smp_acquire_fence(); \ + switch (size) { \ + case 4: \ + asm volatile ( \ + RELEASE_FENCE \ + "1: ldex.w %0, (%3) \n" \ + " cmpne %0, %4 \n" \ + " bt 2f \n" \ + " mov %1, %2 \n" \ + " stex.w %1, (%3) \n" \ + " bez %1, 1b \n" \ + FULL_FENCE \ + "2: \n" \ + : "=&r" (__ret), "=&r" (__tmp) \ + : "r" (__new), "r"(__ptr), "r"(__old) \ + :); \ + break; \ + default: \ + __bad_xchg(); \ + } \ __ret; \ }) +#define arch_cmpxchg(ptr, o, n) \ + (__cmpxchg((ptr), (o), (n), sizeof(*(ptr)))) + +#define arch_cmpxchg_local(ptr, o, n) \ + (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr)))) #else #include <asm-generic/cmpxchg.h> #endif diff --git a/arch/csky/include/asm/io.h b/arch/csky/include/asm/io.h index f82654053dc0..4725bb977b0f 100644 --- a/arch/csky/include/asm/io.h +++ b/arch/csky/include/asm/io.h @@ -5,7 +5,6 @@ #include <linux/pgtable.h> #include <linux/types.h> -#include <linux/version.h> /* * I/O memory access primitives. Reads are ordered relative to any @@ -33,6 +32,17 @@ #endif /* + * String version of I/O memory access operations. + */ +extern void __memcpy_fromio(void *, const volatile void __iomem *, size_t); +extern void __memcpy_toio(volatile void __iomem *, const void *, size_t); +extern void __memset_io(volatile void __iomem *, int, size_t); + +#define memset_io(c,v,l) __memset_io((c),(v),(l)) +#define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l)) +#define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l)) + +/* * I/O memory mapping functions. */ #define ioremap_wc(addr, size) \ diff --git a/arch/csky/kernel/Makefile b/arch/csky/kernel/Makefile index 6c0f36010ed0..4eb41421ca5b 100644 --- a/arch/csky/kernel/Makefile +++ b/arch/csky/kernel/Makefile @@ -2,7 +2,7 @@ extra-y := head.o vmlinux.lds obj-y += entry.o atomic.o signal.o traps.o irq.o time.o vdso.o vdso/ -obj-y += power.o syscall.o syscall_table.o setup.o +obj-y += power.o syscall.o syscall_table.o setup.o io.o obj-y += process.o cpu-probe.o ptrace.o stacktrace.o obj-y += probes/ diff --git a/arch/csky/kernel/io.c b/arch/csky/kernel/io.c new file mode 100644 index 000000000000..5883f13fa2b1 --- /dev/null +++ b/arch/csky/kernel/io.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/export.h> +#include <linux/types.h> +#include <linux/io.h> + +/* + * Copy data from IO memory space to "real" memory space. + */ +void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) +{ + while (count && !IS_ALIGNED((unsigned long)from, 4)) { + *(u8 *)to = __raw_readb(from); + from++; + to++; + count--; + } + + while (count >= 4) { + *(u32 *)to = __raw_readl(from); + from += 4; + to += 4; + count -= 4; + } + + while (count) { + *(u8 *)to = __raw_readb(from); + from++; + to++; + count--; + } +} +EXPORT_SYMBOL(__memcpy_fromio); + +/* + * Copy data from "real" memory space to IO memory space. + */ +void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) +{ + while (count && !IS_ALIGNED((unsigned long)to, 4)) { + __raw_writeb(*(u8 *)from, to); + from++; + to++; + count--; + } + + while (count >= 4) { + __raw_writel(*(u32 *)from, to); + from += 4; + to += 4; + count -= 4; + } + + while (count) { + __raw_writeb(*(u8 *)from, to); + from++; + to++; + count--; + } +} +EXPORT_SYMBOL(__memcpy_toio); + +/* + * "memset" on IO memory space. + */ +void __memset_io(volatile void __iomem *dst, int c, size_t count) +{ + u32 qc = (u8)c; + + qc |= qc << 8; + qc |= qc << 16; + + while (count && !IS_ALIGNED((unsigned long)dst, 4)) { + __raw_writeb(c, dst); + dst++; + count--; + } + + while (count >= 4) { + __raw_writel(qc, dst); + dst += 4; + count -= 4; + } + + while (count) { + __raw_writeb(c, dst); + dst++; + count--; + } +} +EXPORT_SYMBOL(__memset_io); diff --git a/arch/csky/kernel/module.c b/arch/csky/kernel/module.c index 6cd82d69c655..f11b3e573344 100644 --- a/arch/csky/kernel/module.c +++ b/arch/csky/kernel/module.c @@ -68,7 +68,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, *location = rel[i].r_addend + sym->st_value; break; case R_CSKY_PC32: - /* Add the value, subtract its postition */ + /* Add the value, subtract its position */ *location = rel[i].r_addend + sym->st_value - (uint32_t)location; break; diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 42920f25e73c..34ba684d5962 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -30,7 +30,7 @@ static int __kprobes patch_text_cb(void *priv) struct csky_insn_patch *param = priv; unsigned int addr = (unsigned int)param->addr; - if (atomic_inc_return(¶m->cpu_count) == 1) { + if (atomic_inc_return(¶m->cpu_count) == num_online_cpus()) { *(u16 *) addr = cpu_to_le16(param->opcode); dcache_wb_range(addr, addr + 2); atomic_inc(¶m->cpu_count); diff --git a/arch/csky/kernel/probes/uprobes.c b/arch/csky/kernel/probes/uprobes.c index 1a9e0961b2b5..2d31a12e46cf 100644 --- a/arch/csky/kernel/probes/uprobes.c +++ b/arch/csky/kernel/probes/uprobes.c @@ -102,7 +102,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; /* - * Task has received a fatal signal, so reset back to probbed + * Task has received a fatal signal, so reset back to probed * address. */ instruction_pointer_set(regs, utask->vaddr); diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 3d0ca22cd0e2..5de04707aa07 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -2,7 +2,6 @@ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #include <linux/module.h> -#include <linux/version.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/sched/debug.h> diff --git a/arch/csky/lib/Makefile b/arch/csky/lib/Makefile index 7fbdbb2c4d12..d0ce6e2d7ab2 100644 --- a/arch/csky/lib/Makefile +++ b/arch/csky/lib/Makefile @@ -1,3 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only lib-y := usercopy.o delay.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o +ifneq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y) +lib-y += string.o +endif diff --git a/arch/csky/lib/string.c b/arch/csky/lib/string.c new file mode 100644 index 000000000000..d65626fcaeac --- /dev/null +++ b/arch/csky/lib/string.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * String functions optimized for hardware which doesn't + * handle unaligned memory accesses efficiently. + * + * Copyright (C) 2021 Matteo Croce + */ + +#include <linux/types.h> +#include <linux/module.h> + +/* Minimum size for a word copy to be convenient */ +#define BYTES_LONG sizeof(long) +#define WORD_MASK (BYTES_LONG - 1) +#define MIN_THRESHOLD (BYTES_LONG * 2) + +/* convenience union to avoid cast between different pointer types */ +union types { + u8 *as_u8; + unsigned long *as_ulong; + uintptr_t as_uptr; +}; + +union const_types { + const u8 *as_u8; + unsigned long *as_ulong; + uintptr_t as_uptr; +}; + +void *memcpy(void *dest, const void *src, size_t count) +{ + union const_types s = { .as_u8 = src }; + union types d = { .as_u8 = dest }; + int distance = 0; + + if (count < MIN_THRESHOLD) + goto copy_remainder; + + /* Copy a byte at time until destination is aligned. */ + for (; d.as_uptr & WORD_MASK; count--) + *d.as_u8++ = *s.as_u8++; + + distance = s.as_uptr & WORD_MASK; + + if (distance) { + unsigned long last, next; + + /* + * s is distance bytes ahead of d, and d just reached + * the alignment boundary. Move s backward to word align it + * and shift data to compensate for distance, in order to do + * word-by-word copy. + */ + s.as_u8 -= distance; + + next = s.as_ulong[0]; + for (; count >= BYTES_LONG; count -= BYTES_LONG) { + last = next; + next = s.as_ulong[1]; + + d.as_ulong[0] = last >> (distance * 8) | + next << ((BYTES_LONG - distance) * 8); + + d.as_ulong++; + s.as_ulong++; + } + + /* Restore s with the original offset. */ + s.as_u8 += distance; + } else { + /* + * If the source and dest lower bits are the same, do a simple + * 32/64 bit wide copy. + */ + for (; count >= BYTES_LONG; count -= BYTES_LONG) + *d.as_ulong++ = *s.as_ulong++; + } + +copy_remainder: + while (count--) + *d.as_u8++ = *s.as_u8++; + + return dest; +} +EXPORT_SYMBOL(memcpy); + +/* + * Simply check if the buffer overlaps an call memcpy() in case, + * otherwise do a simple one byte at time backward copy. + */ +void *memmove(void *dest, const void *src, size_t count) +{ + if (dest < src || src + count <= dest) + return memcpy(dest, src, count); + + if (dest > src) { + const char *s = src + count; + char *tmp = dest + count; + + while (count--) + *--tmp = *--s; + } + return dest; +} +EXPORT_SYMBOL(memmove); + +void *memset(void *s, int c, size_t count) +{ + union types dest = { .as_u8 = s }; + + if (count >= MIN_THRESHOLD) { + unsigned long cu = (unsigned long)c; + + /* Compose an ulong with 'c' repeated 4/8 times */ + cu |= cu << 8; + cu |= cu << 16; + /* Suppress warning on 32 bit machines */ + cu |= (cu << 16) << 16; + + for (; count && dest.as_uptr & WORD_MASK; count--) + *dest.as_u8++ = c; + + /* Copy using the largest size allowed */ + for (; count >= BYTES_LONG; count -= BYTES_LONG) + *dest.as_ulong++ = cu; + } + + /* copy the remainder */ + while (count--) + *dest.as_u8++ = c; + + return s; +} +EXPORT_SYMBOL(memset); diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index c3a775a7e8f9..82447029feb4 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/scatterlist.h> #include <linux/types.h> -#include <linux/version.h> #include <asm/cache.h> static inline void cache_op(phys_addr_t paddr, size_t size, diff --git a/arch/ia64/include/asm/timex.h b/arch/ia64/include/asm/timex.h index 869a3ac6bf23..7ccc077a60be 100644 --- a/arch/ia64/include/asm/timex.h +++ b/arch/ia64/include/asm/timex.h @@ -39,6 +39,7 @@ get_cycles (void) ret = ia64_getreg(_IA64_REG_AR_ITC); return ret; } +#define get_cycles get_cycles extern void ia64_cpu_local_tick (void); extern unsigned long long ia64_native_sched_clock (void); diff --git a/arch/m68k/Kbuild b/arch/m68k/Kbuild index 18abb35c26a1..7762af9f6def 100644 --- a/arch/m68k/Kbuild +++ b/arch/m68k/Kbuild @@ -17,3 +17,4 @@ obj-$(CONFIG_M68060) += ifpsp060/ obj-$(CONFIG_M68KFPU_EMU) += math-emu/ obj-$(CONFIG_M68000) += 68000/ obj-$(CONFIG_COLDFIRE) += coldfire/ +obj-$(CONFIG_VIRT) += virt/ diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index 16ea9a67723c..3d5da25c73b5 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -327,7 +327,7 @@ comment "Processor Specific Options" config M68KFPU_EMU bool "Math emulation support" - depends on MMU + depends on M68KCLASSIC && FPU help At some point in the future, this will cause floating-point math instructions to be emulated by the kernel on machines that lack a diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine index eeab4f3e6c19..188a8f8a0104 100644 --- a/arch/m68k/Kconfig.machine +++ b/arch/m68k/Kconfig.machine @@ -149,6 +149,23 @@ config SUN3 If you don't want to compile a kernel exclusively for a Sun 3, say N. +config VIRT + bool "Virtual M68k Machine support" + depends on MMU + select GENERIC_CLOCKEVENTS + select GOLDFISH + select GOLDFISH_TIMER + select GOLDFISH_TTY + select M68040 + select MMU_MOTOROLA if MMU + select RTC_CLASS + select RTC_DRV_GOLDFISH + select TTY + select VIRTIO_MMIO + help + This options enable a pure virtual machine based on m68k, + VIRTIO MMIO devices and GOLDFISH interfaces (TTY, RTC, PIC) + config PILOT bool diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 114aaa3f955a..c181030218bf 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -42,7 +42,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -581,6 +580,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -613,7 +613,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -638,7 +637,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -659,6 +657,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 30b9d932b930..40755648fb6c 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -38,7 +38,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -538,6 +537,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -570,7 +570,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y @@ -594,7 +593,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -615,6 +613,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 51ff3180e69d..be0d9155fc5b 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -45,7 +45,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -558,6 +557,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -590,7 +590,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -615,7 +614,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -636,6 +634,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 7d95ca4366e4..9af0e2d0d153 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -35,7 +35,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -530,6 +529,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -562,7 +562,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y @@ -586,7 +585,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -607,6 +605,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index e306e3813607..49341d66feb6 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -37,7 +37,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -540,6 +539,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -572,7 +572,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y @@ -596,7 +595,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -617,6 +615,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 41316cf02441..92b33d5ffab1 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -36,7 +36,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -560,6 +559,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -592,7 +592,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -617,7 +616,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -638,6 +636,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 2fc3f0df6d43..6aaa947bc849 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -56,7 +56,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -646,6 +645,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -678,7 +678,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -703,7 +702,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -724,6 +722,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 9603f4396469..b62d65e59938 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -34,7 +34,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -529,6 +528,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -561,7 +561,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y @@ -585,7 +584,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -606,6 +604,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index c9cabd3344df..8ecf261487d4 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -35,7 +35,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -530,6 +529,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -562,7 +562,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y @@ -586,7 +585,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -607,6 +605,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 5f994bf44fb8..7540d908897b 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -36,7 +36,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -547,6 +546,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -579,7 +579,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -604,7 +603,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -625,6 +623,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 183e33f7d4a0..832b45944617 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -32,7 +32,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -529,6 +528,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -561,7 +561,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y @@ -584,7 +583,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -605,6 +603,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 8214263b9ab8..9171b687e565 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -32,7 +32,6 @@ CONFIG_MQ_IOSCHED_DEADLINE=m CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set CONFIG_ZPOOL=m @@ -528,6 +527,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -560,7 +560,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m -CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y @@ -584,7 +583,6 @@ CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m -CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_SIPHASH=m CONFIG_TEST_IDA=m @@ -605,6 +603,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m -CONFIG_TEST_STACKINIT=m CONFIG_TEST_MEMINIT=m CONFIG_TEST_FREE_PAGES=m diff --git a/arch/m68k/configs/virt_defconfig b/arch/m68k/configs/virt_defconfig new file mode 100644 index 000000000000..8059bd618370 --- /dev/null +++ b/arch/m68k/configs/virt_defconfig @@ -0,0 +1,68 @@ +CONFIG_LOCALVERSION="-virt" +CONFIG_SYSVIPC=y +CONFIG_CGROUPS=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_VIRT=y +CONFIG_PROC_HARDWARE=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_AMIGA_PARTITION=y +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +CONFIG_LDM_DEBUG=y +CONFIG_SUN_PARTITION=y +CONFIG_SYSV68_PARTITION=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y +CONFIG_DEVTMPFS=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_VIRTIO_BLK=y +CONFIG_SCSI=y +CONFIG_BLK_DEV_SR=y +CONFIG_SCSI_VIRTIO=y +CONFIG_NETDEVICES=y +CONFIG_VIRTIO_NET=y +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_EVDEV=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_HW_RANDOM_VIRTIO=y +CONFIG_DRM=y +CONFIG_DRM_VIRTIO_GPU=y +CONFIG_FB=y +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_SND_VIRTIO=y +CONFIG_VIRT_DRIVERS=y +CONFIG_VIRTIO_INPUT=y +CONFIG_EXT4_FS=y +CONFIG_AUTOFS_FS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_9P_FS=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/include/asm/config.h b/arch/m68k/include/asm/config.h index e73ffa23c4f5..9bb888ab5009 100644 --- a/arch/m68k/include/asm/config.h +++ b/arch/m68k/include/asm/config.h @@ -17,6 +17,7 @@ extern int mac_parse_bootinfo(const struct bi_record *record); extern int mvme147_parse_bootinfo(const struct bi_record *record); extern int mvme16x_parse_bootinfo(const struct bi_record *record); extern int q40_parse_bootinfo(const struct bi_record *record); +extern int virt_parse_bootinfo(const struct bi_record *record); extern void config_amiga(void); extern void config_apollo(void); @@ -29,5 +30,6 @@ extern void config_mvme16x(void); extern void config_q40(void); extern void config_sun3(void); extern void config_sun3x(void); +extern void config_virt(void); #endif /* _M68K_CONFIG_H */ diff --git a/arch/m68k/include/asm/io.h b/arch/m68k/include/asm/io.h index aabe6420ead2..aaeabc65e63c 100644 --- a/arch/m68k/include/asm/io.h +++ b/arch/m68k/include/asm/io.h @@ -8,6 +8,9 @@ #include <asm/io_mm.h> #endif +#define gf_ioread32 ioread32be +#define gf_iowrite32 iowrite32be + #include <asm-generic/io.h> #endif /* _M68K_IO_H */ diff --git a/arch/m68k/include/asm/irq.h b/arch/m68k/include/asm/irq.h index 91dd493791d7..7829e955ca04 100644 --- a/arch/m68k/include/asm/irq.h +++ b/arch/m68k/include/asm/irq.h @@ -12,7 +12,8 @@ */ #if defined(CONFIG_COLDFIRE) #define NR_IRQS 256 -#elif defined(CONFIG_VME) || defined(CONFIG_SUN3) || defined(CONFIG_SUN3X) +#elif defined(CONFIG_VME) || defined(CONFIG_SUN3) || \ + defined(CONFIG_SUN3X) || defined(CONFIG_VIRT) #define NR_IRQS 200 #elif defined(CONFIG_ATARI) #define NR_IRQS 141 diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index 143ba7de9bda..9b4e2fe2ac82 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -80,6 +80,9 @@ #elif defined(CONFIG_COLDFIRE) #define KMAP_START 0xe0000000 #define KMAP_END 0xf0000000 +#elif defined(CONFIG_VIRT) +#define KMAP_START 0xdf000000 +#define KMAP_END 0xff000000 #else #define KMAP_START 0xd0000000 #define KMAP_END 0xf0000000 @@ -92,6 +95,10 @@ extern unsigned long m68k_vmalloc_end; #elif defined(CONFIG_COLDFIRE) #define VMALLOC_START 0xd0000000 #define VMALLOC_END 0xe0000000 +#elif defined(CONFIG_VIRT) +#define VMALLOC_OFFSET PAGE_SIZE +#define VMALLOC_START (((unsigned long) high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) +#define VMALLOC_END KMAP_START #else /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the diff --git a/arch/m68k/include/asm/raw_io.h b/arch/m68k/include/asm/raw_io.h index 80eb2396d01e..3ba40bc1dfaa 100644 --- a/arch/m68k/include/asm/raw_io.h +++ b/arch/m68k/include/asm/raw_io.h @@ -80,14 +80,14 @@ ({ u16 __v = le16_to_cpu(*(__force volatile u16 *) (addr)); __v; }) #define rom_out_8(addr, b) \ - ({u8 __maybe_unused __w, __v = (b); u32 _addr = ((u32) (addr)); \ + (void)({u8 __maybe_unused __w, __v = (b); u32 _addr = ((u32) (addr)); \ __w = ((*(__force volatile u8 *) ((_addr | 0x10000) + (__v<<1)))); }) #define rom_out_be16(addr, w) \ - ({u16 __maybe_unused __w, __v = (w); u32 _addr = ((u32) (addr)); \ + (void)({u16 __maybe_unused __w, __v = (w); u32 _addr = ((u32) (addr)); \ __w = ((*(__force volatile u16 *) ((_addr & 0xFFFF0000UL) + ((__v & 0xFF)<<1)))); \ __w = ((*(__force volatile u16 *) ((_addr | 0x10000) + ((__v >> 8)<<1)))); }) #define rom_out_le16(addr, w) \ - ({u16 __maybe_unused __w, __v = (w); u32 _addr = ((u32) (addr)); \ + (void)({u16 __maybe_unused __w, __v = (w); u32 _addr = ((u32) (addr)); \ __w = ((*(__force volatile u16 *) ((_addr & 0xFFFF0000UL) + ((__v >> 8)<<1)))); \ __w = ((*(__force volatile u16 *) ((_addr | 0x10000) + ((__v & 0xFF)<<1)))); }) diff --git a/arch/m68k/include/asm/setup.h b/arch/m68k/include/asm/setup.h index 8f2023f8c1c4..2c99477aaf89 100644 --- a/arch/m68k/include/asm/setup.h +++ b/arch/m68k/include/asm/setup.h @@ -37,7 +37,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_ATARI) || defined(CONFIG_MAC) || defined(CONFIG_APOLLO) \ || defined(CONFIG_MVME16x) || defined(CONFIG_BVME6000) \ || defined(CONFIG_HP300) || defined(CONFIG_Q40) \ - || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) + || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) \ + || defined(CONFIG_VIRT) # define MACH_IS_AMIGA (m68k_machtype == MACH_AMIGA) #else # define MACH_AMIGA_ONLY @@ -50,7 +51,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_AMIGA) || defined(CONFIG_MAC) || defined(CONFIG_APOLLO) \ || defined(CONFIG_MVME16x) || defined(CONFIG_BVME6000) \ || defined(CONFIG_HP300) || defined(CONFIG_Q40) \ - || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) + || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) \ + || defined(CONFIG_VIRT) # define MACH_IS_ATARI (m68k_machtype == MACH_ATARI) #else # define MACH_ATARI_ONLY @@ -63,7 +65,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_AMIGA) || defined(CONFIG_ATARI) || defined(CONFIG_APOLLO) \ || defined(CONFIG_MVME16x) || defined(CONFIG_BVME6000) \ || defined(CONFIG_HP300) || defined(CONFIG_Q40) \ - || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) + || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) \ + || defined(CONFIG_VIRT) # define MACH_IS_MAC (m68k_machtype == MACH_MAC) #else # define MACH_MAC_ONLY @@ -84,7 +87,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_AMIGA) || defined(CONFIG_MAC) || defined(CONFIG_ATARI) \ || defined(CONFIG_MVME16x) || defined(CONFIG_BVME6000) \ || defined(CONFIG_HP300) || defined(CONFIG_Q40) \ - || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) + || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) \ + || defined(CONFIG_VIRT) # define MACH_IS_APOLLO (m68k_machtype == MACH_APOLLO) #else # define MACH_APOLLO_ONLY @@ -97,7 +101,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_AMIGA) || defined(CONFIG_MAC) || defined(CONFIG_ATARI) \ || defined(CONFIG_APOLLO) || defined(CONFIG_BVME6000) \ || defined(CONFIG_HP300) || defined(CONFIG_Q40) \ - || defined(CONFIG_SUN3X) || defined(CONFIG_MVME16x) + || defined(CONFIG_SUN3X) || defined(CONFIG_MVME16x) \ + || defined(CONFIG_VIRT) # define MACH_IS_MVME147 (m68k_machtype == MACH_MVME147) #else # define MACH_MVME147_ONLY @@ -110,7 +115,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_AMIGA) || defined(CONFIG_MAC) || defined(CONFIG_ATARI) \ || defined(CONFIG_APOLLO) || defined(CONFIG_BVME6000) \ || defined(CONFIG_HP300) || defined(CONFIG_Q40) \ - || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) + || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) \ + || defined(CONFIG_VIRT) # define MACH_IS_MVME16x (m68k_machtype == MACH_MVME16x) #else # define MACH_MVME16x_ONLY @@ -123,7 +129,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_AMIGA) || defined(CONFIG_MAC) || defined(CONFIG_ATARI) \ || defined(CONFIG_APOLLO) || defined(CONFIG_MVME16x) \ || defined(CONFIG_HP300) || defined(CONFIG_Q40) \ - || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) + || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) \ + || defined(CONFIG_VIRT) # define MACH_IS_BVME6000 (m68k_machtype == MACH_BVME6000) #else # define MACH_BVME6000_ONLY @@ -136,7 +143,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_AMIGA) || defined(CONFIG_MAC) || defined(CONFIG_ATARI) \ || defined(CONFIG_APOLLO) || defined(CONFIG_MVME16x) \ || defined(CONFIG_BVME6000) || defined(CONFIG_Q40) \ - || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) + || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) \ + || defined(CONFIG_VIRT) # define MACH_IS_HP300 (m68k_machtype == MACH_HP300) #else # define MACH_HP300_ONLY @@ -149,7 +157,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_AMIGA) || defined(CONFIG_MAC) || defined(CONFIG_ATARI) \ || defined(CONFIG_APOLLO) || defined(CONFIG_MVME16x) \ || defined(CONFIG_BVME6000) || defined(CONFIG_HP300) \ - || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) + || defined(CONFIG_SUN3X) || defined(CONFIG_MVME147) \ + || defined(CONFIG_VIRT) # define MACH_IS_Q40 (m68k_machtype == MACH_Q40) #else # define MACH_Q40_ONLY @@ -162,7 +171,8 @@ extern unsigned long m68k_machtype; #elif defined(CONFIG_AMIGA) || defined(CONFIG_MAC) || defined(CONFIG_ATARI) \ || defined(CONFIG_APOLLO) || defined(CONFIG_MVME16x) \ || defined(CONFIG_BVME6000) || defined(CONFIG_HP300) \ - || defined(CONFIG_Q40) || defined(CONFIG_MVME147) + || defined(CONFIG_Q40) || defined(CONFIG_MVME147) \ + || defined(CONFIG_VIRT) # define MACH_IS_SUN3X (m68k_machtype == MACH_SUN3X) #else # define CONFIG_SUN3X_ONLY @@ -170,6 +180,20 @@ extern unsigned long m68k_machtype; # define MACH_TYPE (MACH_SUN3X) #endif +#if !defined(CONFIG_VIRT) +# define MACH_IS_VIRT (0) +#elif defined(CONFIG_AMIGA) || defined(CONFIG_MAC) || defined(CONFIG_ATARI) \ + || defined(CONFIG_APOLLO) || defined(CONFIG_MVME16x) \ + || defined(CONFIG_BVME6000) || defined(CONFIG_HP300) \ + || defined(CONFIG_Q40) || defined(CONFIG_SUN3X) \ + || defined(CONFIG_MVME147) +# define MACH_IS_VIRT (m68k_machtype == MACH_VIRT) +#else +# define MACH_VIRT_ONLY +# define MACH_IS_VIRT (1) +# define MACH_TYPE (MACH_VIRT) +#endif + #ifndef MACH_TYPE # define MACH_TYPE (m68k_machtype) #endif diff --git a/arch/m68k/include/asm/timex.h b/arch/m68k/include/asm/timex.h index 6a21d9358280..f4a7a340f4ca 100644 --- a/arch/m68k/include/asm/timex.h +++ b/arch/m68k/include/asm/timex.h @@ -35,7 +35,7 @@ static inline unsigned long random_get_entropy(void) { if (mach_random_get_entropy) return mach_random_get_entropy(); - return 0; + return random_get_entropy_fallback(); } #define random_get_entropy random_get_entropy diff --git a/arch/m68k/include/asm/virt.h b/arch/m68k/include/asm/virt.h new file mode 100644 index 000000000000..d3320c954796 --- /dev/null +++ b/arch/m68k/include/asm/virt.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VIRT_H +#define __ASM_VIRT_H + +#define NUM_VIRT_SOURCES 200 + +struct virt_booter_device_data { + u32 mmio; + u32 irq; +}; + +struct virt_booter_data { + u32 qemu_version; + struct virt_booter_device_data pic; + struct virt_booter_device_data rtc; + struct virt_booter_device_data tty; + struct virt_booter_device_data ctrl; + struct virt_booter_device_data virtio; +}; + +extern struct virt_booter_data virt_bi_data; + +extern void __init virt_init_IRQ(void); + +#endif diff --git a/arch/m68k/include/uapi/asm/bootinfo-virt.h b/arch/m68k/include/uapi/asm/bootinfo-virt.h new file mode 100644 index 000000000000..e4db7e2213ab --- /dev/null +++ b/arch/m68k/include/uapi/asm/bootinfo-virt.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * asm/bootinfo-virt.h -- Virtual-m68k-specific boot information definitions + */ + +#ifndef _UAPI_ASM_M68K_BOOTINFO_VIRT_H +#define _UAPI_ASM_M68K_BOOTINFO_VIRT_H + +#define BI_VIRT_QEMU_VERSION 0x8000 +#define BI_VIRT_GF_PIC_BASE 0x8001 +#define BI_VIRT_GF_RTC_BASE 0x8002 +#define BI_VIRT_GF_TTY_BASE 0x8003 +#define BI_VIRT_VIRTIO_BASE 0x8004 +#define BI_VIRT_CTRL_BASE 0x8005 + +#define VIRT_BOOTI_VERSION MK_BI_VERSION(2, 0) + +#endif /* _UAPI_ASM_M68K_BOOTINFO_MAC_H */ diff --git a/arch/m68k/include/uapi/asm/bootinfo.h b/arch/m68k/include/uapi/asm/bootinfo.h index 38d3140381fa..203d9cbf9630 100644 --- a/arch/m68k/include/uapi/asm/bootinfo.h +++ b/arch/m68k/include/uapi/asm/bootinfo.h @@ -83,6 +83,7 @@ struct mem_info { #define MACH_SUN3X 11 #define MACH_M54XX 12 #define MACH_M5441X 13 +#define MACH_VIRT 14 /* diff --git a/arch/m68k/kernel/Makefile b/arch/m68k/kernel/Makefile index dbac7f8743fc..c0833da6a2ca 100644 --- a/arch/m68k/kernel/Makefile +++ b/arch/m68k/kernel/Makefile @@ -11,6 +11,7 @@ extra-$(CONFIG_VME) := head.o extra-$(CONFIG_HP300) := head.o extra-$(CONFIG_Q40) := head.o extra-$(CONFIG_SUN3X) := head.o +extra-$(CONFIG_VIRT) := head.o extra-$(CONFIG_SUN3) := sun3-head.o extra-y += vmlinux.lds diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 9434fca68de5..18f278bdbd21 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -181,7 +181,7 @@ do_trace_entry: movel #-ENOSYS,%sp@(PT_OFF_D0)| needed for strace subql #4,%sp SAVE_SWITCH_STACK - jbsr syscall_trace + jbsr syscall_trace_enter RESTORE_SWITCH_STACK addql #4,%sp movel %sp@(PT_OFF_ORIG_D0),%d0 @@ -194,7 +194,7 @@ badsys: do_trace_exit: subql #4,%sp SAVE_SWITCH_STACK - jbsr syscall_trace + jbsr syscall_trace_leave RESTORE_SWITCH_STACK addql #4,%sp jra .Lret_from_exception diff --git a/arch/m68k/kernel/head.S b/arch/m68k/kernel/head.S index 493c95db0e51..9e812d8606be 100644 --- a/arch/m68k/kernel/head.S +++ b/arch/m68k/kernel/head.S @@ -262,6 +262,7 @@ #include <asm/bootinfo-hp300.h> #include <asm/bootinfo-mac.h> #include <asm/bootinfo-q40.h> +#include <asm/bootinfo-virt.h> #include <asm/bootinfo-vme.h> #include <asm/setup.h> #include <asm/entry.h> @@ -534,6 +535,7 @@ func_define putn,1 #define is_not_apollo(lab) cmpl &MACH_APOLLO,%pc@(m68k_machtype); jne lab #define is_not_q40(lab) cmpl &MACH_Q40,%pc@(m68k_machtype); jne lab #define is_not_sun3x(lab) cmpl &MACH_SUN3X,%pc@(m68k_machtype); jne lab +#define is_not_virt(lab) cmpl &MACH_VIRT,%pc@(m68k_machtype); jne lab #define hasnt_leds(lab) cmpl &MACH_HP300,%pc@(m68k_machtype); \ jeq 42f; \ @@ -647,6 +649,14 @@ ENTRY(__start) L(test_notmac): #endif /* CONFIG_MAC */ +#ifdef CONFIG_VIRT + is_not_virt(L(test_notvirt)) + + get_bi_record BI_VIRT_GF_TTY_BASE + lea %pc@(L(virt_gf_tty_base)),%a1 + movel %a0@,%a1@ +L(test_notvirt): +#endif /* CONFIG_VIRT */ /* * There are ultimately two pieces of information we want for all kinds of @@ -1237,6 +1247,13 @@ L(mmu_init_not_mac): L(notsun3x): #endif +#ifdef CONFIG_VIRT + is_not_virt(L(novirt)) + mmu_map_tt #1,#0xFF000000,#0x01000000,#_PAGE_NOCACHE_S + jbra L(mmu_init_done) +L(novirt): +#endif + #ifdef CONFIG_APOLLO is_not_apollo(L(notapollo)) @@ -3186,6 +3203,14 @@ func_start serial_putc,%d0/%d1/%a0/%a1 3: #endif +#ifdef CONFIG_VIRT + is_not_virt(1f) + + movel L(virt_gf_tty_base),%a1 + movel %d0,%a1@(GF_PUT_CHAR) +1: +#endif + L(serial_putc_done): func_return serial_putc @@ -3865,3 +3890,9 @@ q40_mem_cptr: L(q40_do_debug): .long 0 #endif + +#if defined(CONFIG_VIRT) +GF_PUT_CHAR = 0x00 +L(virt_gf_tty_base): + .long 0 +#endif /* CONFIG_VIRT */ diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c index 6342ff4d2073..daebccdd2c09 100644 --- a/arch/m68k/kernel/ptrace.c +++ b/arch/m68k/kernel/ptrace.c @@ -270,12 +270,6 @@ out_eio: return -EIO; } -asmlinkage void syscall_trace(void) -{ - ptrace_report_syscall(0); -} - -#if defined(CONFIG_COLDFIRE) || !defined(CONFIG_MMU) asmlinkage int syscall_trace_enter(void) { int ret = 0; @@ -290,4 +284,3 @@ asmlinkage void syscall_trace_leave(void) if (test_thread_flag(TIF_SYSCALL_TRACE)) ptrace_report_syscall_exit(task_pt_regs(current), 0); } -#endif /* CONFIG_COLDFIRE */ diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c index 8f94feed969c..78ab562beb31 100644 --- a/arch/m68k/kernel/setup_mm.c +++ b/arch/m68k/kernel/setup_mm.c @@ -181,6 +181,8 @@ static void __init m68k_parse_bootinfo(const struct bi_record *record) unknown = hp300_parse_bootinfo(record); else if (MACH_IS_APOLLO) unknown = apollo_parse_bootinfo(record); + else if (MACH_IS_VIRT) + unknown = virt_parse_bootinfo(record); else unknown = 1; } @@ -312,6 +314,11 @@ void __init setup_arch(char **cmdline_p) config_BSP(NULL, 0); break; #endif +#ifdef CONFIG_VIRT + case MACH_VIRT: + config_virt(); + break; +#endif default: panic("No configuration setup"); } diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 49533f65958a..b9f6908a31bc 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -625,6 +625,7 @@ static inline void siginfo_build_tests(void) /* _sigfault._perf */ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x10); BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x18); /* _sigpoll */ BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x0c); diff --git a/arch/m68k/math-emu/fp_arith.c b/arch/m68k/math-emu/fp_arith.c index d9033238d097..f4a06492cd7a 100644 --- a/arch/m68k/math-emu/fp_arith.c +++ b/arch/m68k/math-emu/fp_arith.c @@ -243,7 +243,7 @@ fp_fdiv(struct fp_ext *dest, struct fp_ext *src) /* infinity / infinity = NaN (quiet, as always) */ if (IS_INF(src)) fp_set_nan(dest); - /* infinity / anything else = infinity (with approprate sign) */ + /* infinity / anything else = infinity (with appropriate sign) */ return dest; } if (IS_INF(src)) { diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c index 20ddf71b43d0..7594a945732b 100644 --- a/arch/m68k/mm/kmap.c +++ b/arch/m68k/mm/kmap.c @@ -179,6 +179,12 @@ void __iomem *__ioremap(unsigned long physaddr, unsigned long size, int cachefla return (void __iomem *)physaddr; } #endif +#ifdef CONFIG_VIRT + if (MACH_IS_VIRT) { + if (physaddr >= 0xff000000 && cacheflag == IOMAP_NOCACHE_SER) + return (void __iomem *)physaddr; + } +#endif #ifdef CONFIG_COLDFIRE if (__cf_internalio(physaddr)) return (void __iomem *) physaddr; @@ -293,17 +299,20 @@ EXPORT_SYMBOL(__ioremap); void iounmap(void __iomem *addr) { #ifdef CONFIG_AMIGA - if ((!MACH_IS_AMIGA) || - (((unsigned long)addr < 0x40000000) || - ((unsigned long)addr > 0x60000000))) - free_io_area((__force void *)addr); -#else + if (MACH_IS_AMIGA && + ((unsigned long)addr >= 0x40000000) && + ((unsigned long)addr < 0x60000000)) + return; +#endif +#ifdef CONFIG_VIRT + if (MACH_IS_VIRT && (unsigned long)addr >= 0xff000000) + return; +#endif #ifdef CONFIG_COLDFIRE if (cf_internalio(addr)) return; #endif free_io_area((__force void *)addr); -#endif } EXPORT_SYMBOL(iounmap); diff --git a/arch/m68k/virt/Makefile b/arch/m68k/virt/Makefile new file mode 100644 index 000000000000..54b9b2866654 --- /dev/null +++ b/arch/m68k/virt/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for Linux arch/m68k/virt source directory +# + +obj-y := config.o ints.o platform.o diff --git a/arch/m68k/virt/config.c b/arch/m68k/virt/config.c new file mode 100644 index 000000000000..68d29c8b87e1 --- /dev/null +++ b/arch/m68k/virt/config.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/serial_core.h> +#include <clocksource/timer-goldfish.h> + +#include <asm/bootinfo.h> +#include <asm/bootinfo-virt.h> +#include <asm/byteorder.h> +#include <asm/machdep.h> +#include <asm/virt.h> +#include <asm/config.h> + +struct virt_booter_data virt_bi_data; + +#define VIRT_CTRL_REG_FEATURES 0x00 +#define VIRT_CTRL_REG_CMD 0x04 + +static struct resource ctrlres; + +enum { + CMD_NOOP, + CMD_RESET, + CMD_HALT, + CMD_PANIC, +}; + +static void virt_get_model(char *str) +{ + /* str is 80 characters long */ + sprintf(str, "QEMU Virtual M68K Machine (%u.%u.%u)", + (u8)(virt_bi_data.qemu_version >> 24), + (u8)(virt_bi_data.qemu_version >> 16), + (u8)(virt_bi_data.qemu_version >> 8)); +} + +static void virt_halt(void) +{ + void __iomem *base = (void __iomem *)virt_bi_data.ctrl.mmio; + + iowrite32be(CMD_HALT, base + VIRT_CTRL_REG_CMD); + local_irq_disable(); + while (1) + ; +} + +static void virt_reset(void) +{ + void __iomem *base = (void __iomem *)virt_bi_data.ctrl.mmio; + + iowrite32be(CMD_RESET, base + VIRT_CTRL_REG_CMD); + local_irq_disable(); + while (1) + ; +} + +/* + * Parse a virtual-m68k-specific record in the bootinfo + */ + +int __init virt_parse_bootinfo(const struct bi_record *record) +{ + int unknown = 0; + const void *data = record->data; + + switch (be16_to_cpu(record->tag)) { + case BI_VIRT_QEMU_VERSION: + virt_bi_data.qemu_version = be32_to_cpup(data); + break; + case BI_VIRT_GF_PIC_BASE: + virt_bi_data.pic.mmio = be32_to_cpup(data); + data += 4; + virt_bi_data.pic.irq = be32_to_cpup(data); + break; + case BI_VIRT_GF_RTC_BASE: + virt_bi_data.rtc.mmio = be32_to_cpup(data); + data += 4; + virt_bi_data.rtc.irq = be32_to_cpup(data); + break; + case BI_VIRT_GF_TTY_BASE: + virt_bi_data.tty.mmio = be32_to_cpup(data); + data += 4; + virt_bi_data.tty.irq = be32_to_cpup(data); + break; + case BI_VIRT_CTRL_BASE: + virt_bi_data.ctrl.mmio = be32_to_cpup(data); + data += 4; + virt_bi_data.ctrl.irq = be32_to_cpup(data); + break; + case BI_VIRT_VIRTIO_BASE: + virt_bi_data.virtio.mmio = be32_to_cpup(data); + data += 4; + virt_bi_data.virtio.irq = be32_to_cpup(data); + break; + default: + unknown = 1; + break; + } + return unknown; +} + +static void __init virt_sched_init(void) +{ + goldfish_timer_init(virt_bi_data.rtc.irq, + (void __iomem *)virt_bi_data.rtc.mmio); +} + +void __init config_virt(void) +{ + char earlycon[24]; + + snprintf(earlycon, sizeof(earlycon), "early_gf_tty,0x%08x", + virt_bi_data.tty.mmio); + setup_earlycon(earlycon); + + ctrlres = (struct resource) + DEFINE_RES_MEM_NAMED(virt_bi_data.ctrl.mmio, 0x100, + "virtctrl"); + + if (request_resource(&iomem_resource, &ctrlres)) { + pr_err("Cannot allocate virt controller resource\n"); + return; + } + + mach_init_IRQ = virt_init_IRQ; + mach_sched_init = virt_sched_init; + mach_get_model = virt_get_model; + mach_reset = virt_reset; + mach_halt = virt_halt; + mach_power_off = virt_halt; +} diff --git a/arch/m68k/virt/ints.c b/arch/m68k/virt/ints.c new file mode 100644 index 000000000000..95818f901ebe --- /dev/null +++ b/arch/m68k/virt/ints.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/types.h> +#include <linux/ioport.h> + +#include <asm/hwtest.h> +#include <asm/irq.h> +#include <asm/irq_regs.h> +#include <asm/virt.h> + +#define GFPIC_REG_IRQ_PENDING 0x04 +#define GFPIC_REG_IRQ_DISABLE_ALL 0x08 +#define GFPIC_REG_IRQ_DISABLE 0x0c +#define GFPIC_REG_IRQ_ENABLE 0x10 + +extern void show_registers(struct pt_regs *regs); + +static struct resource picres[6]; +static const char *picname[6] = { + "goldfish_pic.0", + "goldfish_pic.1", + "goldfish_pic.2", + "goldfish_pic.3", + "goldfish_pic.4", + "goldfish_pic.5" +}; + +/* + * 6 goldfish-pic for CPU IRQ #1 to IRQ #6 + * CPU IRQ #1 -> PIC #1 + * IRQ #1 to IRQ #31 -> unused + * IRQ #32 -> goldfish-tty + * CPU IRQ #2 -> PIC #2 + * IRQ #1 to IRQ #32 -> virtio-mmio from 1 to 32 + * CPU IRQ #3 -> PIC #3 + * IRQ #1 to IRQ #32 -> virtio-mmio from 33 to 64 + * CPU IRQ #4 -> PIC #4 + * IRQ #1 to IRQ #32 -> virtio-mmio from 65 to 96 + * CPU IRQ #5 -> PIC #5 + * IRQ #1 to IRQ #32 -> virtio-mmio from 97 to 128 + * CPU IRQ #6 -> PIC #6 + * IRQ #1 -> goldfish-timer + * IRQ #2 -> goldfish-rtc + * IRQ #3 to IRQ #32 -> unused + * CPU IRQ #7 -> NMI + */ + +static u32 gfpic_read(int pic, int reg) +{ + void __iomem *base = (void __iomem *)(virt_bi_data.pic.mmio + + pic * 0x1000); + + return ioread32be(base + reg); +} + +static void gfpic_write(u32 value, int pic, int reg) +{ + void __iomem *base = (void __iomem *)(virt_bi_data.pic.mmio + + pic * 0x1000); + + iowrite32be(value, base + reg); +} + +#define GF_PIC(irq) ((irq - IRQ_USER) / 32) +#define GF_IRQ(irq) ((irq - IRQ_USER) % 32) + +static void virt_irq_enable(struct irq_data *data) +{ + gfpic_write(BIT(GF_IRQ(data->irq)), GF_PIC(data->irq), + GFPIC_REG_IRQ_ENABLE); +} + +static void virt_irq_disable(struct irq_data *data) +{ + gfpic_write(BIT(GF_IRQ(data->irq)), GF_PIC(data->irq), + GFPIC_REG_IRQ_DISABLE); +} + +static unsigned int virt_irq_startup(struct irq_data *data) +{ + virt_irq_enable(data); + return 0; +} + +static irqreturn_t virt_nmi_handler(int irq, void *dev_id) +{ + static int in_nmi; + + if (READ_ONCE(in_nmi)) + return IRQ_HANDLED; + WRITE_ONCE(in_nmi, 1); + + pr_warn("Non-Maskable Interrupt\n"); + show_registers(get_irq_regs()); + + WRITE_ONCE(in_nmi, 0); + return IRQ_HANDLED; +} + +static struct irq_chip virt_irq_chip = { + .name = "virt", + .irq_enable = virt_irq_enable, + .irq_disable = virt_irq_disable, + .irq_startup = virt_irq_startup, + .irq_shutdown = virt_irq_disable, +}; + +static void goldfish_pic_irq(struct irq_desc *desc) +{ + u32 irq_pending; + unsigned int irq_num; + unsigned int pic = desc->irq_data.irq - 1; + + irq_pending = gfpic_read(pic, GFPIC_REG_IRQ_PENDING); + irq_num = IRQ_USER + pic * 32; + + do { + if (irq_pending & 1) + generic_handle_irq(irq_num); + ++irq_num; + irq_pending >>= 1; + } while (irq_pending); +} + +void __init virt_init_IRQ(void) +{ + unsigned int i; + + m68k_setup_irq_controller(&virt_irq_chip, handle_simple_irq, IRQ_USER, + NUM_VIRT_SOURCES - IRQ_USER); + + for (i = 0; i < 6; i++) { + + picres[i] = (struct resource) + DEFINE_RES_MEM_NAMED(virt_bi_data.pic.mmio + i * 0x1000, + 0x1000, picname[i]); + if (request_resource(&iomem_resource, &picres[i])) { + pr_err("Cannot allocate %s resource\n", picname[i]); + return; + } + + irq_set_chained_handler(virt_bi_data.pic.irq + i, + goldfish_pic_irq); + } + + if (request_irq(IRQ_AUTO_7, virt_nmi_handler, 0, "NMI", + virt_nmi_handler)) + pr_err("Couldn't register NMI\n"); +} diff --git a/arch/m68k/virt/platform.c b/arch/m68k/virt/platform.c new file mode 100644 index 000000000000..cb820f19a221 --- /dev/null +++ b/arch/m68k/virt/platform.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/platform_device.h> +#include <linux/interrupt.h> +#include <linux/memblock.h> +#include <asm/virt.h> +#include <asm/irq.h> + +#define VIRTIO_BUS_NB 128 + +static int __init virt_virtio_init(unsigned int id) +{ + const struct resource res[] = { + DEFINE_RES_MEM(virt_bi_data.virtio.mmio + id * 0x200, 0x200), + DEFINE_RES_IRQ(virt_bi_data.virtio.irq + id), + }; + struct platform_device *pdev; + + pdev = platform_device_register_simple("virtio-mmio", id, + res, ARRAY_SIZE(res)); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + return 0; +} + +static int __init virt_platform_init(void) +{ + const struct resource goldfish_tty_res[] = { + DEFINE_RES_MEM(virt_bi_data.tty.mmio, 1), + DEFINE_RES_IRQ(virt_bi_data.tty.irq), + }; + /* this is the second gf-rtc, the first one is used by the scheduler */ + const struct resource goldfish_rtc_res[] = { + DEFINE_RES_MEM(virt_bi_data.rtc.mmio + 0x1000, 0x1000), + DEFINE_RES_IRQ(virt_bi_data.rtc.irq + 1), + }; + struct platform_device *pdev; + unsigned int i; + + if (!MACH_IS_VIRT) + return -ENODEV; + + /* We need this to have DMA'able memory provided to goldfish-tty */ + min_low_pfn = 0; + + pdev = platform_device_register_simple("goldfish_tty", + PLATFORM_DEVID_NONE, + goldfish_tty_res, + ARRAY_SIZE(goldfish_tty_res)); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + pdev = platform_device_register_simple("goldfish_rtc", + PLATFORM_DEVID_NONE, + goldfish_rtc_res, + ARRAY_SIZE(goldfish_rtc_res)); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + for (i = 0; i < VIRTIO_BUS_NB; i++) { + int err; + + err = virt_virtio_init(i); + if (err) + return err; + } + + return 0; +} + +arch_initcall(virt_platform_init); diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index 5cb91509bb7c..d82f4ebf687f 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -178,12 +178,8 @@ CONFIG_NETCONSOLE=m CONFIG_ATM_TCP=m CONFIG_ATM_LANAI=m CONFIG_ATM_ENI=m -CONFIG_ATM_FIRESTREAM=m -CONFIG_ATM_ZATM=m CONFIG_ATM_NICSTAR=m CONFIG_ATM_IDT77252=m -CONFIG_ATM_AMBASSADOR=m -CONFIG_ATM_HORIZON=m CONFIG_ATM_IA=m CONFIG_ATM_FORE200E=m CONFIG_ATM_HE=m @@ -214,7 +210,6 @@ CONFIG_ATH_DEBUG=y CONFIG_ATH5K=y CONFIG_ATH5K_DEBUG=y CONFIG_WAN=y -CONFIG_LANMEDIA=m CONFIG_HDLC=m CONFIG_HDLC_RAW=m CONFIG_HDLC_RAW_ETH=m diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index 205d3b34528c..4194e79b435c 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -255,12 +255,8 @@ CONFIG_ARCNET_COM20020_CS=m CONFIG_ATM_TCP=m CONFIG_ATM_LANAI=m CONFIG_ATM_ENI=m -CONFIG_ATM_FIRESTREAM=m -CONFIG_ATM_ZATM=m CONFIG_ATM_NICSTAR=m CONFIG_ATM_IDT77252=m -CONFIG_ATM_AMBASSADOR=m -CONFIG_ATM_HORIZON=m CONFIG_ATM_IA=m CONFIG_ATM_FORE200E=m CONFIG_ATM_HE=m @@ -281,7 +277,6 @@ CONFIG_CHELSIO_T1=m CONFIG_NET_TULIP=y CONFIG_DE2104X=m CONFIG_TULIP=m -CONFIG_DE4X5=m CONFIG_WINBOND_840=m CONFIG_DM9102=m CONFIG_ULI526X=m @@ -363,7 +358,6 @@ CONFIG_USB_AN2720=y CONFIG_USB_EPSON2888=y CONFIG_USB_SIERRA_NET=m CONFIG_WAN=y -CONFIG_LANMEDIA=m CONFIG_HDLC=m CONFIG_HDLC_RAW=m CONFIG_HDLC_RAW_ETH=m diff --git a/arch/mips/include/asm/timex.h b/arch/mips/include/asm/timex.h index b05bb70a2e46..2e107886f97a 100644 --- a/arch/mips/include/asm/timex.h +++ b/arch/mips/include/asm/timex.h @@ -40,9 +40,9 @@ typedef unsigned int cycles_t; /* - * On R4000/R4400 before version 5.0 an erratum exists such that if the - * cycle counter is read in the exact moment that it is matching the - * compare register, no interrupt will be generated. + * On R4000/R4400 an erratum exists such that if the cycle counter is + * read in the exact moment that it is matching the compare register, + * no interrupt will be generated. * * There is a suggested workaround and also the erratum can't strike if * the compare interrupt isn't being used as the clock source device. @@ -63,7 +63,7 @@ static inline int can_use_mips_counter(unsigned int prid) if (!__builtin_constant_p(cpu_has_counter)) asm volatile("" : "=m" (cpu_data[0].options)); if (likely(cpu_has_counter && - prid >= (PRID_IMP_R4000 | PRID_REV_ENCODE_44(5, 0)))) + prid > (PRID_IMP_R4000 | PRID_REV_ENCODE_44(15, 15)))) return 1; else return 0; @@ -76,25 +76,24 @@ static inline cycles_t get_cycles(void) else return 0; /* no usable counter */ } +#define get_cycles get_cycles /* * Like get_cycles - but where c0_count is not available we desperately * use c0_random in an attempt to get at least a little bit of entropy. - * - * R6000 and R6000A neither have a count register nor a random register. - * That leaves no entropy source in the CPU itself. */ static inline unsigned long random_get_entropy(void) { - unsigned int prid = read_c0_prid(); - unsigned int imp = prid & PRID_IMP_MASK; + unsigned int c0_random; - if (can_use_mips_counter(prid)) + if (can_use_mips_counter(read_c0_prid())) return read_c0_count(); - else if (likely(imp != PRID_IMP_R6000 && imp != PRID_IMP_R6000A)) - return read_c0_random(); + + if (cpu_has_3kex) + c0_random = (read_c0_random() >> 8) & 0x3f; else - return 0; /* no usable register */ + c0_random = read_c0_random() & 0x3f; + return (random_get_entropy_fallback() << 6) | (0x3f - c0_random); } #define random_get_entropy random_get_entropy diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 1d55e57b8466..18f3d95ecfec 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -146,6 +146,8 @@ #define SO_TXREHASH 74 +#define SO_RCVMARK 75 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index caa01457dce6..ed339d7979f3 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -141,15 +141,10 @@ static __init int cpu_has_mfc0_count_bug(void) case CPU_R4400MC: /* * The published errata for the R4400 up to 3.0 say the CPU - * has the mfc0 from count bug. + * has the mfc0 from count bug. This seems the last version + * produced. */ - if ((current_cpu_data.processor_id & 0xff) <= 0x30) - return 1; - - /* - * we assume newer revisions are ok - */ - return 0; + return 1; } return 0; diff --git a/arch/nios2/include/asm/timex.h b/arch/nios2/include/asm/timex.h index a769f871b28d..40a1adc9bd03 100644 --- a/arch/nios2/include/asm/timex.h +++ b/arch/nios2/include/asm/timex.h @@ -8,5 +8,8 @@ typedef unsigned long cycles_t; extern cycles_t get_cycles(void); +#define get_cycles get_cycles + +#define random_get_entropy() (((unsigned long)get_cycles()) ?: random_get_entropy_fallback()) #endif diff --git a/arch/openrisc/include/asm/timex.h b/arch/openrisc/include/asm/timex.h index d52b4e536e3f..5487fa93dd9b 100644 --- a/arch/openrisc/include/asm/timex.h +++ b/arch/openrisc/include/asm/timex.h @@ -23,6 +23,7 @@ static inline cycles_t get_cycles(void) { return mfspr(SPR_TTCR); } +#define get_cycles get_cycles /* This isn't really used any more */ #define CLOCK_TICK_RATE 1000 diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index 15f1b38dfe03..2fa6cefa62ca 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -521,6 +521,15 @@ _start: l.ori r3,r0,0x1 l.mtspr r0,r3,SPR_SR + /* + * Start the TTCR as early as possible, so that the RNG can make use of + * measurements of boot time from the earliest opportunity. Especially + * important is that the TTCR does not return zero by the time we reach + * random_init(). + */ + l.movhi r3,hi(SPR_TTMR_CR) + l.mtspr r0,r3,SPR_TTMR + CLEAR_GPR(r1) CLEAR_GPR(r2) CLEAR_GPR(r3) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 52e550b45692..bd22578859d0 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -38,6 +38,7 @@ config PARISC select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_ARCH_TOPOLOGY if SMP + select GENERIC_CPU_DEVICES if !SMP select GENERIC_LIB_DEVMEM_IS_ALLOWED select SYSCTL_ARCH_UNALIGN_ALLOW select SYSCTL_EXCEPTION_TRACE diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index a5fee10d76ee..8ce0ae370680 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -6,6 +6,9 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=16 +CONFIG_CGROUPS=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y CONFIG_PERF_EVENTS=y @@ -47,7 +50,6 @@ CONFIG_PARPORT=y CONFIG_PARPORT_PC=m CONFIG_PARPORT_1284=y CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_CRYPTOLOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=6144 CONFIG_BLK_DEV_SD=y diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig index 1b8fd80cbe7f..57501b0aed92 100644 --- a/arch/parisc/configs/generic-64bit_defconfig +++ b/arch/parisc/configs/generic-64bit_defconfig @@ -16,6 +16,7 @@ CONFIG_CGROUPS=y CONFIG_MEMCG=y CONFIG_CGROUP_PIDS=y CONFIG_CPUSETS=y +CONFIG_USER_NS=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -267,9 +268,9 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRC_CCITT=m CONFIG_LIBCRC32C=y CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_KERNEL=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index e8b4a03343d3..8d03b3b26229 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -59,20 +59,12 @@ void flush_dcache_page(struct page *page); flush_kernel_icache_range_asm(s,e); \ } while (0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { \ - flush_cache_page(vma, vaddr, page_to_pfn(page)); \ - memcpy(dst, src, len); \ - flush_kernel_dcache_range_asm((unsigned long)dst, (unsigned long)dst + len); \ -} while (0) - -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ -do { \ - flush_cache_page(vma, vaddr, page_to_pfn(page)); \ - memcpy(dst, src, len); \ -} while (0) - -void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn); +void copy_to_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long user_vaddr, void *dst, void *src, int len); +void copy_from_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long user_vaddr, void *dst, void *src, int len); +void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, + unsigned long pfn); void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -80,16 +72,7 @@ void flush_cache_range(struct vm_area_struct *vma, void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); #define ARCH_HAS_FLUSH_ANON_PAGE -static inline void -flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) -{ - if (PageAnon(page)) { - flush_tlb_page(vma, vmaddr); - preempt_disable(); - flush_dcache_page_asm(page_to_phys(page), vmaddr); - preempt_enable(); - } -} +void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr); #define ARCH_HAS_FLUSH_ON_KUNMAP static inline void kunmap_flush_on_unmap(void *addr) diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 0561568f7b48..6faaaa3ebe9b 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -26,12 +26,14 @@ #define copy_page(to, from) copy_page_asm((void *)(to), (void *)(from)) struct page; +struct vm_area_struct; void clear_page_asm(void *page); void copy_page_asm(void *to, void *from); #define clear_user_page(vto, vaddr, page) clear_page_asm(vto) -void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, - struct page *pg); +void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, + struct vm_area_struct *vma); +#define __HAVE_ARCH_COPY_USER_HIGHPAGE /* * These are used to make use of C type-checking.. diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 939db6fe620b..69765a6dbe89 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -160,7 +160,7 @@ extern void __update_cache(pte_t pte); #define SPACEID_SHIFT (MAX_ADDRBITS - 32) #else #define MAX_ADDRBITS (BITS_PER_LONG) -#define MAX_ADDRESS (1UL << MAX_ADDRBITS) +#define MAX_ADDRESS (1ULL << MAX_ADDRBITS) #define SPACEID_SHIFT 0 #endif diff --git a/arch/parisc/include/asm/timex.h b/arch/parisc/include/asm/timex.h index 06b510f8172e..b4622cb06a75 100644 --- a/arch/parisc/include/asm/timex.h +++ b/arch/parisc/include/asm/timex.h @@ -13,9 +13,10 @@ typedef unsigned long cycles_t; -static inline cycles_t get_cycles (void) +static inline cycles_t get_cycles(void) { return mfctl(16); } +#define get_cycles get_cycles #endif diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 654061e0964e..f486d3dfb6bb 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -127,6 +127,8 @@ #define SO_TXREHASH 0x4048 +#define SO_RCVMARK 0x4049 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 23348199f3f8..0fd04073d4b6 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -27,6 +27,7 @@ #include <asm/processor.h> #include <asm/sections.h> #include <asm/shmparam.h> +#include <asm/mmu_context.h> int split_tlb __ro_after_init; int dcache_stride __ro_after_init; @@ -91,7 +92,7 @@ static inline void flush_data_cache(void) } -/* Virtual address of pfn. */ +/* Kernel virtual address of pfn. */ #define pfn_va(pfn) __va(PFN_PHYS(pfn)) void @@ -124,11 +125,13 @@ show_cache_info(struct seq_file *m) cache_info.ic_size/1024 ); if (cache_info.dc_loop != 1) snprintf(buf, 32, "%lu-way associative", cache_info.dc_loop); - seq_printf(m, "D-cache\t\t: %ld KB (%s%s, %s)\n", + seq_printf(m, "D-cache\t\t: %ld KB (%s%s, %s, alias=%d)\n", cache_info.dc_size/1024, (cache_info.dc_conf.cc_wt ? "WT":"WB"), (cache_info.dc_conf.cc_sh ? ", shared I/D":""), - ((cache_info.dc_loop == 1) ? "direct mapped" : buf)); + ((cache_info.dc_loop == 1) ? "direct mapped" : buf), + cache_info.dc_conf.cc_alias + ); seq_printf(m, "ITLB entries\t: %ld\n" "DTLB entries\t: %ld%s\n", cache_info.it_size, cache_info.dt_size, @@ -324,25 +327,81 @@ __flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, preempt_enable(); } -static inline void -__purge_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, - unsigned long physaddr) +static void flush_user_cache_page(struct vm_area_struct *vma, unsigned long vmaddr) { - if (!static_branch_likely(&parisc_has_cache)) - return; + unsigned long flags, space, pgd, prot; +#ifdef CONFIG_TLB_PTLOCK + unsigned long pgd_lock; +#endif + + vmaddr &= PAGE_MASK; + preempt_disable(); - purge_dcache_page_asm(physaddr, vmaddr); + + /* Set context for flush */ + local_irq_save(flags); + prot = mfctl(8); + space = mfsp(SR_USER); + pgd = mfctl(25); +#ifdef CONFIG_TLB_PTLOCK + pgd_lock = mfctl(28); +#endif + switch_mm_irqs_off(NULL, vma->vm_mm, NULL); + local_irq_restore(flags); + + flush_user_dcache_range_asm(vmaddr, vmaddr + PAGE_SIZE); if (vma->vm_flags & VM_EXEC) - flush_icache_page_asm(physaddr, vmaddr); + flush_user_icache_range_asm(vmaddr, vmaddr + PAGE_SIZE); + flush_tlb_page(vma, vmaddr); + + /* Restore previous context */ + local_irq_save(flags); +#ifdef CONFIG_TLB_PTLOCK + mtctl(pgd_lock, 28); +#endif + mtctl(pgd, 25); + mtsp(space, SR_USER); + mtctl(prot, 8); + local_irq_restore(flags); + preempt_enable(); } +static inline pte_t *get_ptep(struct mm_struct *mm, unsigned long addr) +{ + pte_t *ptep = NULL; + pgd_t *pgd = mm->pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + if (!pgd_none(*pgd)) { + p4d = p4d_offset(pgd, addr); + if (!p4d_none(*p4d)) { + pud = pud_offset(p4d, addr); + if (!pud_none(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) + ptep = pte_offset_map(pmd, addr); + } + } + } + return ptep; +} + +static inline bool pte_needs_flush(pte_t pte) +{ + return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NO_CACHE)) + == (_PAGE_PRESENT | _PAGE_ACCESSED); +} + void flush_dcache_page(struct page *page) { struct address_space *mapping = page_mapping_file(page); struct vm_area_struct *mpnt; unsigned long offset; unsigned long addr, old_addr = 0; + unsigned long count = 0; pgoff_t pgoff; if (mapping && !mapping_mapped(mapping)) { @@ -357,33 +416,52 @@ void flush_dcache_page(struct page *page) pgoff = page->index; - /* We have carefully arranged in arch_get_unmapped_area() that + /* + * We have carefully arranged in arch_get_unmapped_area() that * *any* mappings of a file are always congruently mapped (whether * declared as MAP_PRIVATE or MAP_SHARED), so we only need - * to flush one address here for them all to become coherent */ - + * to flush one address here for them all to become coherent + * on machines that support equivalent aliasing + */ flush_dcache_mmap_lock(mapping); vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; addr = mpnt->vm_start + offset; + if (parisc_requires_coherency()) { + pte_t *ptep; - /* The TLB is the engine of coherence on parisc: The - * CPU is entitled to speculate any page with a TLB - * mapping, so here we kill the mapping then flush the - * page along a special flush only alias mapping. - * This guarantees that the page is no-longer in the - * cache for any process and nor may it be - * speculatively read in (until the user or kernel - * specifically accesses it, of course) */ - - flush_tlb_page(mpnt, addr); - if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) - != (addr & (SHM_COLOUR - 1))) { - __flush_cache_page(mpnt, addr, page_to_phys(page)); - if (parisc_requires_coherency() && old_addr) - printk(KERN_ERR "INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", old_addr, addr, mpnt->vm_file); - old_addr = addr; + ptep = get_ptep(mpnt->vm_mm, addr); + if (ptep && pte_needs_flush(*ptep)) + flush_user_cache_page(mpnt, addr); + } else { + /* + * The TLB is the engine of coherence on parisc: + * The CPU is entitled to speculate any page + * with a TLB mapping, so here we kill the + * mapping then flush the page along a special + * flush only alias mapping. This guarantees that + * the page is no-longer in the cache for any + * process and nor may it be speculatively read + * in (until the user or kernel specifically + * accesses it, of course) + */ + flush_tlb_page(mpnt, addr); + if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) + != (addr & (SHM_COLOUR - 1))) { + __flush_cache_page(mpnt, addr, page_to_phys(page)); + /* + * Software is allowed to have any number + * of private mappings to a page. + */ + if (!(mpnt->vm_flags & VM_SHARED)) + continue; + if (old_addr) + pr_err("INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", + old_addr, addr, mpnt->vm_file); + old_addr = addr; + } } + WARN_ON(++count == 4096); } flush_dcache_mmap_unlock(mapping); } @@ -417,23 +495,16 @@ void __init parisc_setup_cache_timing(void) printk(KERN_DEBUG "Whole cache flush %lu cycles, flushing %lu bytes %lu cycles\n", alltime, size, rangetime); - threshold = L1_CACHE_ALIGN(size * alltime / rangetime); + threshold = L1_CACHE_ALIGN((unsigned long)((uint64_t)size * alltime / rangetime)); + pr_info("Calculated flush threshold is %lu KiB\n", + threshold/1024); /* - * The threshold computed above isn't very reliable since the - * flush times depend greatly on the percentage of dirty lines - * in the flush range. Further, the whole cache time doesn't - * include the time to refill lines that aren't in the mm/vma - * being flushed. By timing glibc build and checks on mako cpus, - * the following formula seems to work reasonably well. The - * value from the timing calculation is too small, and increases - * build and check times by almost a factor two. + * The threshold computed above isn't very reliable. The following + * heuristic works reasonably well on c8000/rp3440. */ threshold2 = cache_info.dc_size * num_online_cpus(); - if (threshold2 > threshold) - threshold = threshold2; - if (threshold) - parisc_cache_flush_threshold = threshold; + parisc_cache_flush_threshold = threshold2; printk(KERN_INFO "Cache flush threshold set to %lu KiB\n", parisc_cache_flush_threshold/1024); @@ -489,19 +560,47 @@ void flush_kernel_dcache_page_addr(void *addr) } EXPORT_SYMBOL(flush_kernel_dcache_page_addr); -void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, - struct page *pg) +static void flush_cache_page_if_present(struct vm_area_struct *vma, + unsigned long vmaddr, unsigned long pfn) { - /* Copy using kernel mapping. No coherency is needed (all in - kunmap) for the `to' page. However, the `from' page needs to - be flushed through a mapping equivalent to the user mapping - before it can be accessed through the kernel mapping. */ - preempt_disable(); - flush_dcache_page_asm(__pa(vfrom), vaddr); - copy_page_asm(vto, vfrom); - preempt_enable(); + pte_t *ptep = get_ptep(vma->vm_mm, vmaddr); + + /* + * The pte check is racy and sometimes the flush will trigger + * a non-access TLB miss. Hopefully, the page has already been + * flushed. + */ + if (ptep && pte_needs_flush(*ptep)) + flush_cache_page(vma, vmaddr, pfn); +} + +void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + void *kto, *kfrom; + + kfrom = kmap_local_page(from); + kto = kmap_local_page(to); + flush_cache_page_if_present(vma, vaddr, page_to_pfn(from)); + copy_page_asm(kto, kfrom); + kunmap_local(kto); + kunmap_local(kfrom); +} + +void copy_to_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long user_vaddr, void *dst, void *src, int len) +{ + flush_cache_page_if_present(vma, user_vaddr, page_to_pfn(page)); + memcpy(dst, src, len); + flush_kernel_dcache_range_asm((unsigned long)dst, (unsigned long)dst + len); +} + +void copy_from_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long user_vaddr, void *dst, void *src, int len) +{ + flush_cache_page_if_present(vma, user_vaddr, page_to_pfn(page)); + memcpy(dst, src, len); } -EXPORT_SYMBOL(copy_user_page); /* __flush_tlb_range() * @@ -532,92 +631,105 @@ int __flush_tlb_range(unsigned long sid, unsigned long start, return 0; } -static inline unsigned long mm_total_size(struct mm_struct *mm) +static void flush_cache_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct vm_area_struct *vma; - unsigned long usize = 0; - - for (vma = mm->mmap; vma; vma = vma->vm_next) - usize += vma->vm_end - vma->vm_start; - return usize; -} - -static inline pte_t *get_ptep(pgd_t *pgd, unsigned long addr) -{ - pte_t *ptep = NULL; + unsigned long addr, pfn; + pte_t *ptep; - if (!pgd_none(*pgd)) { - p4d_t *p4d = p4d_offset(pgd, addr); - if (!p4d_none(*p4d)) { - pud_t *pud = pud_offset(p4d, addr); - if (!pud_none(*pud)) { - pmd_t *pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) - ptep = pte_offset_map(pmd, addr); + for (addr = start; addr < end; addr += PAGE_SIZE) { + /* + * The vma can contain pages that aren't present. Although + * the pte search is expensive, we need the pte to find the + * page pfn and to check whether the page should be flushed. + */ + ptep = get_ptep(vma->vm_mm, addr); + if (ptep && pte_needs_flush(*ptep)) { + if (parisc_requires_coherency()) { + flush_user_cache_page(vma, addr); + } else { + pfn = pte_pfn(*ptep); + if (WARN_ON(!pfn_valid(pfn))) + return; + __flush_cache_page(vma, addr, PFN_PHYS(pfn)); } } } - return ptep; } -static void flush_cache_pages(struct vm_area_struct *vma, struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline unsigned long mm_total_size(struct mm_struct *mm) { - unsigned long addr, pfn; - pte_t *ptep; + struct vm_area_struct *vma; + unsigned long usize = 0; - for (addr = start; addr < end; addr += PAGE_SIZE) { - ptep = get_ptep(mm->pgd, addr); - if (ptep) { - pfn = pte_pfn(*ptep); - flush_cache_page(vma, addr, pfn); - } - } + for (vma = mm->mmap; vma && usize < parisc_cache_flush_threshold; vma = vma->vm_next) + usize += vma->vm_end - vma->vm_start; + return usize; } void flush_cache_mm(struct mm_struct *mm) { struct vm_area_struct *vma; - /* Flushing the whole cache on each cpu takes forever on - rp3440, etc. So, avoid it if the mm isn't too big. */ - if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && - mm_total_size(mm) >= parisc_cache_flush_threshold) { - if (mm->context.space_id) - flush_tlb_all(); + /* + * Flushing the whole cache on each cpu takes forever on + * rp3440, etc. So, avoid it if the mm isn't too big. + * + * Note that we must flush the entire cache on machines + * with aliasing caches to prevent random segmentation + * faults. + */ + if (!parisc_requires_coherency() + || mm_total_size(mm) >= parisc_cache_flush_threshold) { + if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) + return; + flush_tlb_all(); flush_cache_all(); return; } + /* Flush mm */ for (vma = mm->mmap; vma; vma = vma->vm_next) - flush_cache_pages(vma, mm, vma->vm_start, vma->vm_end); + flush_cache_pages(vma, vma->vm_start, vma->vm_end); } -void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && - end - start >= parisc_cache_flush_threshold) { - if (vma->vm_mm->context.space_id) - flush_tlb_range(vma, start, end); + if (!parisc_requires_coherency() + || end - start >= parisc_cache_flush_threshold) { + if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) + return; + flush_tlb_range(vma, start, end); flush_cache_all(); return; } - flush_cache_pages(vma, vma->vm_mm, start, end); + flush_cache_pages(vma, start, end); } -void -flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) +void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) { - if (pfn_valid(pfn)) { - if (likely(vma->vm_mm->context.space_id)) { - flush_tlb_page(vma, vmaddr); - __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); - } else { - __purge_cache_page(vma, vmaddr, PFN_PHYS(pfn)); - } + if (WARN_ON(!pfn_valid(pfn))) + return; + if (parisc_requires_coherency()) + flush_user_cache_page(vma, vmaddr); + else + __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); +} + +void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) +{ + if (!PageAnon(page)) + return; + + if (parisc_requires_coherency()) { + flush_user_cache_page(vma, vmaddr); + return; } + + flush_tlb_page(vma, vmaddr); + preempt_disable(); + flush_dcache_page_asm(page_to_phys(page), vmaddr); + preempt_enable(); } void flush_kernel_vmap_range(void *vaddr, int size) diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c index 3343d2fb7889..6e0b86652f30 100644 --- a/arch/parisc/kernel/kprobes.c +++ b/arch/parisc/kernel/kprobes.c @@ -152,7 +152,7 @@ int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs) /* for absolute branch instructions we can copy iaoq_b. for relative * branch instructions we need to calculate the new address based on the * difference between iaoq_f and iaoq_b. We cannot use iaoq_b without - * modificationt because it's based on our ainsn.insn address. + * modifications because it's based on our ainsn.insn address. */ if (p->post_handler) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index d98692115221..26eb568f8b96 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -171,6 +171,7 @@ static int __init processor_probe(struct parisc_device *dev) p->cpu_num = cpu_info.cpu_num; p->cpu_loc = cpu_info.cpu_loc; + set_cpu_possible(cpuid, true); store_cpu_topology(cpuid); #ifdef CONFIG_SMP @@ -419,8 +420,7 @@ show_cpuinfo (struct seq_file *m, void *v) } seq_printf(m, " (0x%02lx)\n", boot_cpu_data.pdc.capabilities); - seq_printf(m, "model\t\t: %s\n" - "model name\t: %s\n", + seq_printf(m, "model\t\t: %s - %s\n", boot_cpu_data.pdc.sys_model_name, cpuinfo->dev ? cpuinfo->dev->name : "Unknown"); @@ -461,6 +461,13 @@ static struct parisc_driver cpu_driver __refdata = { */ void __init processor_init(void) { + unsigned int cpu; + reset_cpu_topology(); + + /* reset possible mask. We will mark those which are possible. */ + for_each_possible_cpu(cpu) + set_cpu_possible(cpu, false); + register_parisc_driver(&cpu_driver); } diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index b91cb45ffd4e..f005ddedb50e 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -161,6 +161,8 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PA11 dma_ops_init(); #endif + + clear_sched_clock_stable(); } /* diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index bb27dfeeddfc..9714fbd7c42d 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -251,13 +251,9 @@ void __init time_init(void) static int __init init_cr16_clocksource(void) { /* - * The cr16 interval timers are not syncronized across CPUs, even if - * they share the same socket. + * The cr16 interval timers are not synchronized across CPUs. */ if (num_online_cpus() > 1 && !running_on_qemu) { - /* mark sched_clock unstable */ - clear_sched_clock_stable(); - clocksource_cr16.name = "cr16_unstable"; clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE; clocksource_cr16.rating = 0; diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index a6e61cf2cad0..b78f1b9d45c1 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -469,7 +469,7 @@ void parisc_terminate(char *msg, struct pt_regs *regs, int code, unsigned long o * panic notifiers, and we should call panic * directly from the location that we wish. * e.g. We should not call panic from - * parisc_terminate, but rather the oter way around. + * parisc_terminate, but rather the other way around. * This hack works, prints the panic message twice, * and it enables reboot timers! */ diff --git a/arch/parisc/math-emu/dfadd.c b/arch/parisc/math-emu/dfadd.c index ec487e07f004..00e561d4aa55 100644 --- a/arch/parisc/math-emu/dfadd.c +++ b/arch/parisc/math-emu/dfadd.c @@ -253,7 +253,7 @@ dbl_fadd( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/parisc/math-emu/dfsub.c b/arch/parisc/math-emu/dfsub.c index c4f30acf2d48..4f03782284bd 100644 --- a/arch/parisc/math-emu/dfsub.c +++ b/arch/parisc/math-emu/dfsub.c @@ -256,7 +256,7 @@ dbl_fsub( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/parisc/math-emu/sfadd.c b/arch/parisc/math-emu/sfadd.c index 838758279d5b..9b98c874dfac 100644 --- a/arch/parisc/math-emu/sfadd.c +++ b/arch/parisc/math-emu/sfadd.c @@ -249,7 +249,7 @@ sgl_fadd( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/parisc/math-emu/sfsub.c b/arch/parisc/math-emu/sfsub.c index 583d3ace4634..29d9eed09d12 100644 --- a/arch/parisc/math-emu/sfsub.c +++ b/arch/parisc/math-emu/sfsub.c @@ -252,7 +252,7 @@ sgl_fsub( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index f114e102aaf2..84bc437be5cd 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -22,6 +22,8 @@ #include <asm/traps.h> +#define DEBUG_NATLB 0 + /* Various important other fields */ #define bit22set(x) (x & 0x00000200) #define bits23_25set(x) (x & 0x000001c0) @@ -450,8 +452,8 @@ handle_nadtlb_fault(struct pt_regs *regs) fallthrough; case 0x380: /* PDC and FIC instructions */ - if (printk_ratelimit()) { - pr_warn("BUG: nullifying cache flush/purge instruction\n"); + if (DEBUG_NATLB && printk_ratelimit()) { + pr_warn("WARNING: nullifying cache flush/purge instruction\n"); show_regs(regs); } if (insn & 0x20) { diff --git a/arch/powerpc/configs/chrp32_defconfig b/arch/powerpc/configs/chrp32_defconfig index a4a805b87469..fb314f75ad4b 100644 --- a/arch/powerpc/configs/chrp32_defconfig +++ b/arch/powerpc/configs/chrp32_defconfig @@ -53,7 +53,6 @@ CONFIG_ATA_GENERIC=y CONFIG_NETDEVICES=y CONFIG_PCNET32=y CONFIG_NET_TULIP=y -CONFIG_DE4X5=y CONFIG_MV643XX_ETH=y CONFIG_8139CP=y CONFIG_8139TOO=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index bb549cb1c3e3..b622ecd73286 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -444,7 +444,6 @@ CONFIG_NET_TULIP=y CONFIG_DE2104X=m CONFIG_TULIP=m CONFIG_TULIP_MMIO=y -CONFIG_DE4X5=m CONFIG_WINBOND_840=m CONFIG_DM9102=m CONFIG_ULI526X=m diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index ecbae1832de3..61a4736355c2 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -13,7 +13,8 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE .macro __EMIT_BUG_ENTRY addr,file,line,flags .section __bug_table,"aw" -5001: .4byte \addr - 5001b, 5002f - 5001b +5001: .4byte \addr - . + .4byte 5002f - . .short \line, \flags .org 5001b+BUG_ENTRY_SIZE .previous @@ -24,7 +25,7 @@ #else .macro __EMIT_BUG_ENTRY addr,file,line,flags .section __bug_table,"aw" -5001: .4byte \addr - 5001b +5001: .4byte \addr - . .short \flags .org 5001b+BUG_ENTRY_SIZE .previous @@ -49,15 +50,16 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE #define _EMIT_BUG_ENTRY \ ".section __bug_table,\"aw\"\n" \ - "2:\t.4byte 1b - 2b, %0 - 2b\n" \ - "\t.short %1, %2\n" \ + "2: .4byte 1b - .\n" \ + " .4byte %0 - .\n" \ + " .short %1, %2\n" \ ".org 2b+%3\n" \ ".previous\n" #else #define _EMIT_BUG_ENTRY \ ".section __bug_table,\"aw\"\n" \ - "2:\t.4byte 1b - 2b\n" \ - "\t.short %2\n" \ + "2: .4byte 1b - .\n" \ + " .short %2\n" \ ".org 2b+%3\n" \ ".previous\n" #endif diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h index fa2e76e4093a..14b4489de52c 100644 --- a/arch/powerpc/include/asm/timex.h +++ b/arch/powerpc/include/asm/timex.h @@ -19,6 +19,7 @@ static inline cycles_t get_cycles(void) { return mftb(); } +#define get_cycles get_cycles #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TIMEX_H */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 65562c4a0a69..4c09c6688ac6 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -752,7 +752,7 @@ u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) * FIXME: How do i get PID? Do I really need it? * prstatus.pr_pid = ???? */ - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + elf_core_copy_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); return buf; diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S index eb9c81e1c218..0c4ecc8fec5a 100644 --- a/arch/powerpc/kernel/vdso/gettimeofday.S +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -22,12 +22,15 @@ .macro cvdso_call funct call_time=0 .cfi_startproc PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM mflr r0 - .cfi_register lr, r0 PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) + .cfi_rel_offset lr, PPC_MIN_STKFRM + PPC_LR_STKOFF #ifdef __powerpc64__ PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT #endif get_datapage r5 .ifeq \call_time @@ -39,13 +42,15 @@ PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) #ifdef __powerpc64__ PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_restore r2 #endif .ifeq \call_time cmpwi r3, 0 .endif mtlr r0 - .cfi_restore lr addi r1, r1, 2 * PPC_MIN_STKFRM + .cfi_restore lr + .cfi_def_cfa_offset 0 crclr so .ifeq \call_time beqlr+ diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S index e3ab9df6cf19..6cfcd20d4668 100644 --- a/arch/powerpc/kvm/book3s_32_sr.S +++ b/arch/powerpc/kvm/book3s_32_sr.S @@ -122,11 +122,27 @@ /* 0x0 - 0xb */ - /* 'current->mm' needs to be in r4 */ - tophys(r4, r2) - lwz r4, MM(r4) - tophys(r4, r4) - /* This only clobbers r0, r3, r4 and r5 */ + /* switch_mmu_context() needs paging, let's enable it */ + mfmsr r9 + ori r11, r9, MSR_DR + mtmsr r11 + sync + + /* switch_mmu_context() clobbers r12, rescue it */ + SAVE_GPR(12, r1) + + /* Calling switch_mmu_context(<inv>, current->mm, <inv>); */ + lwz r4, MM(r2) bl switch_mmu_context + /* restore r12 */ + REST_GPR(12, r1) + + /* Disable paging again */ + mfmsr r9 + li r6, MSR_DR + andc r9, r9, r6 + mtmsr r9 + sync + .endm diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index b97bc179f65a..adcb1a1a2bfe 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -112,7 +112,7 @@ static void __init fill_prstatus(struct elf_prstatus *prstatus, int pir, struct pt_regs *regs) { memset(prstatus, 0, sizeof(struct elf_prstatus)); - elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs); + elf_core_copy_regs(&(prstatus->pr_reg), regs); /* * Overload PID with PIR value. diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f58728d5f10d..39962c905542 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -462,7 +462,6 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu { struct papr_scm_perf_stat *stat; struct papr_scm_perf_stats *stats; - char *statid; int index, rc, count; u32 available_events; @@ -493,14 +492,12 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu for (index = 0, stat = stats->scm_statistic, count = 0; index < available_events; index++, ++stat) { - statid = kzalloc(strlen(stat->stat_id) + 1, GFP_KERNEL); - if (!statid) { + p->nvdimm_events_map[count] = kmemdup_nul(stat->stat_id, 8, GFP_KERNEL); + if (!p->nvdimm_events_map[count]) { rc = -ENOMEM; goto out_nvdimm_events_map; } - strcpy(statid, stat->stat_id); - p->nvdimm_events_map[count] = statid; count++; } p->nvdimm_events_map[count] = NULL; diff --git a/arch/powerpc/platforms/pseries/vas-sysfs.c b/arch/powerpc/platforms/pseries/vas-sysfs.c index 909535ca513a..ec65586cbeb3 100644 --- a/arch/powerpc/platforms/pseries/vas-sysfs.c +++ b/arch/powerpc/platforms/pseries/vas-sysfs.c @@ -27,22 +27,31 @@ struct vas_caps_entry { /* * This function is used to get the notification from the drmgr when - * QoS credits are changed. Though receiving the target total QoS - * credits here, get the official QoS capabilities from the hypervisor. + * QoS credits are changed. */ -static ssize_t update_total_credits_trigger(struct vas_cop_feat_caps *caps, +static ssize_t update_total_credits_store(struct vas_cop_feat_caps *caps, const char *buf, size_t count) { int err; u16 creds; err = kstrtou16(buf, 0, &creds); + /* + * The user space interface from the management console + * notifies OS with the new QoS credits and then the + * hypervisor. So OS has to use this new credits value + * and reconfigure VAS windows (close or reopen depends + * on the credits available) instead of depending on VAS + * QoS capabilities from the hypervisor. + */ if (!err) - err = vas_reconfig_capabilties(caps->win_type); + err = vas_reconfig_capabilties(caps->win_type, creds); if (err) return -EINVAL; + pr_info("Set QoS total credits %u\n", creds); + return count; } @@ -92,7 +101,7 @@ VAS_ATTR_RO(nr_total_credits); VAS_ATTR_RO(nr_used_credits); static struct vas_sysfs_entry update_total_credits_attribute = - __ATTR(update_total_credits, 0200, NULL, update_total_credits_trigger); + __ATTR(update_total_credits, 0200, NULL, update_total_credits_store); static struct attribute *vas_def_capab_attrs[] = { &nr_total_credits_attribute.attr, diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 1f59d78c77a1..ec643bbdb67f 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -779,10 +779,10 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, * changes. Reconfig window configurations based on the credits * availability from this new capabilities. */ -int vas_reconfig_capabilties(u8 type) +int vas_reconfig_capabilties(u8 type, int new_nr_creds) { struct vas_cop_feat_caps *caps; - int old_nr_creds, new_nr_creds; + int old_nr_creds; struct vas_caps *vcaps; int rc = 0, nr_active_wins; @@ -795,12 +795,6 @@ int vas_reconfig_capabilties(u8 type) caps = &vcaps->caps; mutex_lock(&vas_pseries_mutex); - rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, vcaps->feat, - (u64)virt_to_phys(&hv_cop_caps)); - if (rc) - goto out; - - new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); old_nr_creds = atomic_read(&caps->nr_total_credits); @@ -832,7 +826,6 @@ int vas_reconfig_capabilties(u8 type) false); } -out: mutex_unlock(&vas_pseries_mutex); return rc; } @@ -850,7 +843,7 @@ static int pseries_vas_notifier(struct notifier_block *nb, struct of_reconfig_data *rd = data; struct device_node *dn = rd->dn; const __be32 *intserv = NULL; - int len, rc = 0; + int new_nr_creds, len, rc = 0; if ((action == OF_RECONFIG_ATTACH_NODE) || (action == OF_RECONFIG_DETACH_NODE)) @@ -862,7 +855,15 @@ static int pseries_vas_notifier(struct notifier_block *nb, if (!intserv) return NOTIFY_OK; - rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE); + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, + new_nr_creds); + } + if (rc) pr_err("Failed reconfig VAS capabilities with DLPAR\n"); diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index 34177881e998..333ffa2f9f42 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -135,7 +135,7 @@ struct pseries_vas_window { }; int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps); -int vas_reconfig_capabilties(u8 type); +int vas_reconfig_capabilties(u8 type, int new_nr_creds); int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); #ifdef CONFIG_PPC_VAS diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 00fd9c548f26..3ac2a81a55eb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -468,7 +468,7 @@ config CC_HAVE_STACKPROTECTOR_TLS config STACKPROTECTOR_PER_TASK def_bool y - depends on !GCC_PLUGIN_RANDSTRUCT + depends on !RANDSTRUCT depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_TLS config PHYS_RAM_BASE_FIXED diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi index 746c4d4e7686..cf2f55e1dcb6 100644 --- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi @@ -366,7 +366,7 @@ gpio1: gpio@20121000 { compatible = "microchip,mpfs-gpio"; - reg = <000 0x20121000 0x0 0x1000>; + reg = <0x0 0x20121000 0x0 0x1000>; interrupt-parent = <&plic>; interrupt-controller; #interrupt-cells = <1>; diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index aad45d7f498f..5c638fd5b35c 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -167,7 +167,7 @@ clocks = <&prci FU540_PRCI_CLK_TLCLK>; status = "disabled"; }; - dma: dma@3000000 { + dma: dma-controller@3000000 { compatible = "sifive,fu540-c000-pdma"; reg = <0x0 0x3000000 0x0 0x8000>; interrupt-parent = <&plic0>; diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index d3804a2f9aad..1aaea81fb141 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -30,8 +30,8 @@ typedef u32 bug_insn_t; #ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS -#define __BUG_ENTRY_ADDR RISCV_INT " 1b - 2b" -#define __BUG_ENTRY_FILE RISCV_INT " %0 - 2b" +#define __BUG_ENTRY_ADDR RISCV_INT " 1b - ." +#define __BUG_ENTRY_FILE RISCV_INT " %0 - ." #else #define __BUG_ENTRY_ADDR RISCV_PTR " 1b" #define __BUG_ENTRY_FILE RISCV_PTR " %0" diff --git a/arch/riscv/include/asm/timex.h b/arch/riscv/include/asm/timex.h index 507cae273bc6..d6a7428f6248 100644 --- a/arch/riscv/include/asm/timex.h +++ b/arch/riscv/include/asm/timex.h @@ -41,7 +41,7 @@ static inline u32 get_cycles_hi(void) static inline unsigned long random_get_entropy(void) { if (unlikely(clint_time_val == NULL)) - return 0; + return random_get_entropy_fallback(); return get_cycles(); } #define random_get_entropy() random_get_entropy() diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b0793dc0c291..05ed641a1134 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -208,8 +208,25 @@ static void __init setup_bootmem(void) * early_init_fdt_reserve_self() since __pa() does * not work for DTB pointers that are fixmap addresses */ - if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) - memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) { + /* + * In case the DTB is not located in a memory region we won't + * be able to locate it later on via the linear mapping and + * get a segfault when accessing it via __va(dtb_early_pa). + * To avoid this situation copy DTB to a memory region. + * Note that memblock_phys_alloc will also reserve DTB region. + */ + if (!memblock_is_memory(dtb_early_pa)) { + size_t fdt_size = fdt_totalsize(dtb_early_va); + phys_addr_t new_dtb_early_pa = memblock_phys_alloc(fdt_size, PAGE_SIZE); + void *new_dtb_early_va = early_memremap(new_dtb_early_pa, fdt_size); + + memcpy(new_dtb_early_va, dtb_early_va, fdt_size); + early_memunmap(new_dtb_early_va, fdt_size); + _dtb_early_pa = new_dtb_early_pa; + } else + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + } early_init_fdt_scan_reserved_mem(); dma_contiguous_reserve(dma32_phys_limit); diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index f42d9cd3b64d..2a3715bf29fe 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -535,6 +535,43 @@ static inline u32 rv_amoadd_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) return rv_amo_insn(0, aq, rl, rs2, rs1, 2, rd, 0x2f); } +static inline u32 rv_amoand_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0xc, aq, rl, rs2, rs1, 2, rd, 0x2f); +} + +static inline u32 rv_amoor_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x8, aq, rl, rs2, rs1, 2, rd, 0x2f); +} + +static inline u32 rv_amoxor_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x4, aq, rl, rs2, rs1, 2, rd, 0x2f); +} + +static inline u32 rv_amoswap_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x1, aq, rl, rs2, rs1, 2, rd, 0x2f); +} + +static inline u32 rv_lr_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x2, aq, rl, rs2, rs1, 2, rd, 0x2f); +} + +static inline u32 rv_sc_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x3, aq, rl, rs2, rs1, 2, rd, 0x2f); +} + +static inline u32 rv_fence(u8 pred, u8 succ) +{ + u16 imm11_0 = pred << 4 | succ; + + return rv_i_insn(imm11_0, 0, 0, 0, 0xf); +} + /* RVC instrutions. */ static inline u16 rvc_addi4spn(u8 rd, u32 imm10) @@ -753,6 +790,36 @@ static inline u32 rv_amoadd_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) return rv_amo_insn(0, aq, rl, rs2, rs1, 3, rd, 0x2f); } +static inline u32 rv_amoand_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0xc, aq, rl, rs2, rs1, 3, rd, 0x2f); +} + +static inline u32 rv_amoor_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x8, aq, rl, rs2, rs1, 3, rd, 0x2f); +} + +static inline u32 rv_amoxor_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x4, aq, rl, rs2, rs1, 3, rd, 0x2f); +} + +static inline u32 rv_amoswap_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x1, aq, rl, rs2, rs1, 3, rd, 0x2f); +} + +static inline u32 rv_lr_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x2, aq, rl, rs2, rs1, 3, rd, 0x2f); +} + +static inline u32 rv_sc_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x3, aq, rl, rs2, rs1, 3, rd, 0x2f); +} + /* RV64-only RVC instructions. */ static inline u16 rvc_ld(u8 rd, u32 imm8, u8 rs1) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 0bcda99d1d68..00df3a8f92ac 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -455,6 +455,90 @@ static int emit_call(bool fixed, u64 addr, struct rv_jit_context *ctx) return 0; } +static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, + struct rv_jit_context *ctx) +{ + u8 r0; + int jmp_offset; + + if (off) { + if (is_12b_int(off)) { + emit_addi(RV_REG_T1, rd, off, ctx); + } else { + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + } + rd = RV_REG_T1; + } + + switch (imm) { + /* lock *(u32/u64 *)(dst_reg + off16) <op>= src_reg */ + case BPF_ADD: + emit(is64 ? rv_amoadd_d(RV_REG_ZERO, rs, rd, 0, 0) : + rv_amoadd_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + break; + case BPF_AND: + emit(is64 ? rv_amoand_d(RV_REG_ZERO, rs, rd, 0, 0) : + rv_amoand_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + break; + case BPF_OR: + emit(is64 ? rv_amoor_d(RV_REG_ZERO, rs, rd, 0, 0) : + rv_amoor_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + break; + case BPF_XOR: + emit(is64 ? rv_amoxor_d(RV_REG_ZERO, rs, rd, 0, 0) : + rv_amoxor_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + break; + /* src_reg = atomic_fetch_<op>(dst_reg + off16, src_reg) */ + case BPF_ADD | BPF_FETCH: + emit(is64 ? rv_amoadd_d(rs, rs, rd, 0, 0) : + rv_amoadd_w(rs, rs, rd, 0, 0), ctx); + if (!is64) + emit_zext_32(rs, ctx); + break; + case BPF_AND | BPF_FETCH: + emit(is64 ? rv_amoand_d(rs, rs, rd, 0, 0) : + rv_amoand_w(rs, rs, rd, 0, 0), ctx); + if (!is64) + emit_zext_32(rs, ctx); + break; + case BPF_OR | BPF_FETCH: + emit(is64 ? rv_amoor_d(rs, rs, rd, 0, 0) : + rv_amoor_w(rs, rs, rd, 0, 0), ctx); + if (!is64) + emit_zext_32(rs, ctx); + break; + case BPF_XOR | BPF_FETCH: + emit(is64 ? rv_amoxor_d(rs, rs, rd, 0, 0) : + rv_amoxor_w(rs, rs, rd, 0, 0), ctx); + if (!is64) + emit_zext_32(rs, ctx); + break; + /* src_reg = atomic_xchg(dst_reg + off16, src_reg); */ + case BPF_XCHG: + emit(is64 ? rv_amoswap_d(rs, rs, rd, 0, 0) : + rv_amoswap_w(rs, rs, rd, 0, 0), ctx); + if (!is64) + emit_zext_32(rs, ctx); + break; + /* r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg); */ + case BPF_CMPXCHG: + r0 = bpf_to_rv_reg(BPF_REG_0, ctx); + emit(is64 ? rv_addi(RV_REG_T2, r0, 0) : + rv_addiw(RV_REG_T2, r0, 0), ctx); + emit(is64 ? rv_lr_d(r0, 0, rd, 0, 0) : + rv_lr_w(r0, 0, rd, 0, 0), ctx); + jmp_offset = ninsns_rvoff(8); + emit(rv_bne(RV_REG_T2, r0, jmp_offset >> 1), ctx); + emit(is64 ? rv_sc_d(RV_REG_T3, rs, rd, 0, 0) : + rv_sc_w(RV_REG_T3, rs, rd, 0, 0), ctx); + jmp_offset = ninsns_rvoff(-6); + emit(rv_bne(RV_REG_T3, 0, jmp_offset >> 1), ctx); + emit(rv_fence(0x3, 0x3), ctx); + break; + } +} + #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) @@ -1146,30 +1230,8 @@ out_be: break; case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: - if (insn->imm != BPF_ADD) { - pr_err("bpf-jit: not supported: atomic operation %02x ***\n", - insn->imm); - return -EINVAL; - } - - /* atomic_add: lock *(u32 *)(dst + off) += src - * atomic_add: lock *(u64 *)(dst + off) += src - */ - - if (off) { - if (is_12b_int(off)) { - emit_addi(RV_REG_T1, rd, off, ctx); - } else { - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - } - - rd = RV_REG_T1; - } - - emit(BPF_SIZE(code) == BPF_W ? - rv_amoadd_w(RV_REG_ZERO, rs, rd, 0, 0) : - rv_amoadd_d(RV_REG_ZERO, rs, rd, 0, 0), ctx); + emit_atomic(rd, rs, off, imm, + BPF_SIZE(code) == BPF_DW, ctx); break; default: pr_err("bpf-jit: unknown opcode %02x\n", code); diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e441b60b1812..80eb3ee84ff1 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -20,7 +20,9 @@ LDFLAGS_vmlinux := -pie endif aflags_dwarf := -Wa,-gdwarf-2 KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ +ifndef CONFIG_AS_IS_LLVM KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) +endif KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float -mbackchain @@ -30,6 +32,16 @@ KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) + +ifdef CONFIG_CC_IS_GCC + ifeq ($(call cc-ifversion, -ge, 1200, y), y) + ifeq ($(call cc-ifversion, -lt, 1300, y), y) + KBUILD_CFLAGS += $(call cc-disable-warning, array-bounds) + KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, array-bounds) + endif + endif +endif + UTS_MACHINE := s390x STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384) CHECKFLAGS += -D__s390__ -D__s390x__ diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore index b265bfede188..f56591bc0897 100644 --- a/arch/s390/boot/.gitignore +++ b/arch/s390/boot/.gitignore @@ -2,3 +2,6 @@ image bzImage section_cmp.* +vmlinux +vmlinux.lds +vmlinux.syms diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 0ba646899131..883357a211a3 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -37,14 +37,21 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o -obj-y += version.o pgm_check_info.o ctype.o +obj-y += version.o pgm_check_info.o ctype.o ipl_data.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o -targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) -subdir- := compressed +obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o +obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o +obj-all := $(obj-y) piggy.o syms.o + +targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) +targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 +targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 +targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all) OBJECTS := $(addprefix $(obj)/,$(obj-y)) +OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all)) quiet_cmd_section_cmp = SECTCMP $* define cmd_section_cmp @@ -59,14 +66,67 @@ define cmd_section_cmp touch $@ endef -$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE +$(obj)/bzImage: $(obj)/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE $(call if_changed,objcopy) -$(obj)/section_cmp%: vmlinux $(obj)/compressed/vmlinux FORCE +$(obj)/section_cmp%: vmlinux $(obj)/vmlinux FORCE $(call if_changed,section_cmp) -$(obj)/compressed/vmlinux: $(obj)/startup.a FORCE - $(Q)$(MAKE) $(build)=$(obj)/compressed $@ +LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup --build-id=sha1 -T +$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS_ALL) FORCE + $(call if_changed,ld) + +LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T +$(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(OBJECTS) FORCE + $(call if_changed,ld) + +quiet_cmd_dumpsyms = DUMPSYMS $< +define cmd_dumpsyms + $(NM) -n -S --format=bsd "$<" | sed -nE 's/^0*([0-9a-fA-F]+) 0*([0-9a-fA-F]+) [tT] ([^ ]*)$$/\1 \2 \3/p' | tr '\n' '\0' > "$@" +endef + +$(obj)/syms.bin: $(obj)/vmlinux.syms FORCE + $(call if_changed,dumpsyms) + +OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.decompressor.syms +$(obj)/syms.o: $(obj)/syms.bin FORCE + $(call if_changed,objcopy) + +OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load +$(obj)/info.bin: vmlinux FORCE + $(call if_changed,objcopy) + +OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info +$(obj)/info.o: $(obj)/info.bin FORCE + $(call if_changed,objcopy) + +OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S +$(obj)/vmlinux.bin: vmlinux FORCE + $(call if_changed,objcopy) + +suffix-$(CONFIG_KERNEL_GZIP) := .gz +suffix-$(CONFIG_KERNEL_BZIP2) := .bz2 +suffix-$(CONFIG_KERNEL_LZ4) := .lz4 +suffix-$(CONFIG_KERNEL_LZMA) := .lzma +suffix-$(CONFIG_KERNEL_LZO) := .lzo +suffix-$(CONFIG_KERNEL_XZ) := .xz +suffix-$(CONFIG_KERNEL_ZSTD) := .zst -$(obj)/startup.a: $(OBJECTS) FORCE - $(call if_changed,ar) +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) +$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE + $(call if_changed,bzip2_with_size) +$(obj)/vmlinux.bin.lz4: $(obj)/vmlinux.bin FORCE + $(call if_changed,lz4_with_size) +$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzma_with_size) +$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzo_with_size) +$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE + $(call if_changed,xzkern_with_size) +$(obj)/vmlinux.bin.zst: $(obj)/vmlinux.bin FORCE + $(call if_changed,zstd22_with_size) + +OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed +$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE + $(call if_changed,objcopy) diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 641ce0fc5c3e..70418389414d 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -2,9 +2,12 @@ #ifndef BOOT_BOOT_H #define BOOT_BOOT_H -#include <asm/extable.h> #include <linux/types.h> +#define IPL_START 0x200 + +#ifndef __ASSEMBLY__ + void startup_kernel(void); unsigned long detect_memory(void); bool is_ipl_block_dump(void); @@ -31,4 +34,5 @@ extern char _stack_start[], _stack_end[]; unsigned long read_ipl_report(unsigned long safe_offset); +#endif /* __ASSEMBLY__ */ #endif /* BOOT_BOOT_H */ diff --git a/arch/s390/boot/compressed/clz_ctz.c b/arch/s390/boot/clz_ctz.c index c3ebf248596b..c3ebf248596b 100644 --- a/arch/s390/boot/compressed/clz_ctz.c +++ b/arch/s390/boot/clz_ctz.c diff --git a/arch/s390/boot/compressed/.gitignore b/arch/s390/boot/compressed/.gitignore deleted file mode 100644 index 01d93832cf4a..000000000000 --- a/arch/s390/boot/compressed/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -vmlinux -vmlinux.lds -vmlinux.syms diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile deleted file mode 100644 index d04e0e7de0b3..000000000000 --- a/arch/s390/boot/compressed/Makefile +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# linux/arch/s390/boot/compressed/Makefile -# -# create a compressed vmlinux image from the original vmlinux -# - -KCOV_INSTRUMENT := n -GCOV_PROFILE := n -UBSAN_SANITIZE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n - -obj-y := $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o -obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o -obj-all := $(obj-y) piggy.o syms.o -targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 -targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 -targets += vmlinux.bin.zst -targets += info.bin syms.bin vmlinux.syms $(obj-all) - -KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR) -KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR) -OBJCOPYFLAGS := - -OBJECTS := $(addprefix $(obj)/,$(obj-y)) -OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all)) - -LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup --build-id=sha1 -T -$(obj)/vmlinux: $(obj)/vmlinux.lds $(objtree)/arch/s390/boot/startup.a $(OBJECTS_ALL) FORCE - $(call if_changed,ld) - -LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T -$(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(objtree)/arch/s390/boot/startup.a $(OBJECTS) FORCE - $(call if_changed,ld) - -quiet_cmd_dumpsyms = DUMPSYMS $< -define cmd_dumpsyms - $(NM) -n -S --format=bsd "$<" | sed -nE 's/^0*([0-9a-fA-F]+) 0*([0-9a-fA-F]+) [tT] ([^ ]*)$$/\1 \2 \3/p' | tr '\n' '\0' > "$@" -endef - -$(obj)/syms.bin: $(obj)/vmlinux.syms FORCE - $(call if_changed,dumpsyms) - -OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.decompressor.syms -$(obj)/syms.o: $(obj)/syms.bin FORCE - $(call if_changed,objcopy) - -OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load -$(obj)/info.bin: vmlinux FORCE - $(call if_changed,objcopy) - -OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info -$(obj)/info.o: $(obj)/info.bin FORCE - $(call if_changed,objcopy) - -OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S -$(obj)/vmlinux.bin: vmlinux FORCE - $(call if_changed,objcopy) - -suffix-$(CONFIG_KERNEL_GZIP) := .gz -suffix-$(CONFIG_KERNEL_BZIP2) := .bz2 -suffix-$(CONFIG_KERNEL_LZ4) := .lz4 -suffix-$(CONFIG_KERNEL_LZMA) := .lzma -suffix-$(CONFIG_KERNEL_LZO) := .lzo -suffix-$(CONFIG_KERNEL_XZ) := .xz -suffix-$(CONFIG_KERNEL_ZSTD) := .zst - -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE - $(call if_changed,bzip2_with_size) -$(obj)/vmlinux.bin.lz4: $(obj)/vmlinux.bin FORCE - $(call if_changed,lz4_with_size) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE - $(call if_changed,lzma_with_size) -$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE - $(call if_changed,lzo_with_size) -$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE - $(call if_changed,xzkern_with_size) -$(obj)/vmlinux.bin.zst: $(obj)/vmlinux.bin FORCE - $(call if_changed,zstd22_with_size) - -OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed -$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE - $(call if_changed,objcopy) diff --git a/arch/s390/boot/compressed/decompressor.c b/arch/s390/boot/decompressor.c index e27c2140d620..e27c2140d620 100644 --- a/arch/s390/boot/compressed/decompressor.c +++ b/arch/s390/boot/decompressor.c diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/decompressor.h index f75cc31a77dd..f75cc31a77dd 100644 --- a/arch/s390/boot/compressed/decompressor.h +++ b/arch/s390/boot/decompressor.h diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index 666692429db0..3f79b9efb803 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -27,234 +27,181 @@ #include <asm/page.h> #include <asm/ptrace.h> #include <asm/sclp.h> - -#define ARCH_OFFSET 4 +#include "boot.h" #define EP_OFFSET 0x10008 #define EP_STRING "S390EP" +#define IPL_BS 0x730 __HEAD - -#define IPL_BS 0x730 - .org 0 - .long 0x00080000,0x80000000+iplstart # The first 24 bytes are loaded - .long 0x02000018,0x60000050 # by ipl to addresses 0-23. - .long 0x02000068,0x60000050 # (a PSW and two CCWs). - .fill 80-24,1,0x40 # bytes 24-79 are discarded !! - .long 0x020000f0,0x60000050 # The next 160 byte are loaded - .long 0x02000140,0x60000050 # to addresses 0x18-0xb7 - .long 0x02000190,0x60000050 # They form the continuation - .long 0x020001e0,0x60000050 # of the CCW program started - .long 0x02000230,0x60000050 # by ipl and load the range - .long 0x02000280,0x60000050 # 0x0f0-0x730 from the image - .long 0x020002d0,0x60000050 # to the range 0x0f0-0x730 - .long 0x02000320,0x60000050 # in memory. At the end of - .long 0x02000370,0x60000050 # the channel program the PSW - .long 0x020003c0,0x60000050 # at location 0 is loaded. - .long 0x02000410,0x60000050 # Initial processing starts - .long 0x02000460,0x60000050 # at 0x200 = iplstart. - .long 0x020004b0,0x60000050 - .long 0x02000500,0x60000050 - .long 0x02000550,0x60000050 - .long 0x020005a0,0x60000050 - .long 0x020005f0,0x60000050 - .long 0x02000640,0x60000050 - .long 0x02000690,0x60000050 - .long 0x020006e0,0x20000050 - - .org __LC_RST_NEW_PSW # 0x1a0 - .quad 0,iplstart - .org __LC_EXT_NEW_PSW # 0x1b0 - .quad 0x0002000180000000,0x1b0 # disabled wait - .org __LC_PGM_NEW_PSW # 0x1d0 - .quad 0x0000000180000000,startup_pgm_check_handler - .org __LC_IO_NEW_PSW # 0x1f0 - .quad 0x0002000180000000,0x1f0 # disabled wait - - .org 0x200 - +ipl_start: + mvi __LC_AR_MODE_ID,1 # set esame flag + slr %r0,%r0 # set cpuid to zero + lhi %r1,2 # mode 2 = esame (dump) + sigp %r1,%r0,0x12 # switch to esame mode + sam64 # switch to 64 bit addressing mode + lgh %r1,__LC_SUBCHANNEL_ID # test if subchannel number + brctg %r1,.Lnoload # is valid + llgf %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number + lghi %r2,IPL_BS # load start address + bras %r14,.Lloader # load rest of ipl image + larl %r12,parmarea # pointer to parameter area + stg %r1,IPL_DEVICE-PARMAREA(%r12) # save ipl device number +# +# load parameter file from ipl device +# +.Lagain1: + larl %r2,_end # ramdisk loc. is temp + bras %r14,.Lloader # load parameter file + ltgr %r2,%r2 # got anything ? + jz .Lnopf + lg %r3,MAX_COMMAND_LINE_SIZE-PARMAREA(%r12) + aghi %r3,-1 + clgr %r2,%r3 + jl .Lnotrunc + lgr %r2,%r3 +.Lnotrunc: + larl %r4,_end + larl %r13,.L_hdr + clc 0(3,%r4),0(%r13) # if it is HDRx + jz .Lagain1 # skip dataset header + larl %r13,.L_eof + clc 0(3,%r4),0(%r13) # if it is EOFx + jz .Lagain1 # skip dateset trailer + lgr %r5,%r2 + la %r6,COMMAND_LINE-PARMAREA(%r12) + lgr %r7,%r2 + aghi %r7,1 + mvcl %r6,%r4 +.Lnopf: +# +# load ramdisk from ipl device +# +.Lagain2: + larl %r2,_end # addr of ramdisk + stg %r2,INITRD_START-PARMAREA(%r12) + bras %r14,.Lloader # load ramdisk + stg %r2,INITRD_SIZE-PARMAREA(%r12) # store size of rd + ltgr %r2,%r2 + jnz .Lrdcont + stg %r2,INITRD_START-PARMAREA(%r12) # no ramdisk found +.Lrdcont: + larl %r2,_end + larl %r13,.L_hdr # skip HDRx and EOFx + clc 0(3,%r2),0(%r13) + jz .Lagain2 + larl %r13,.L_eof + clc 0(3,%r2),0(%r13) + jz .Lagain2 +# +# reset files in VM reader +# + larl %r13,.Lcpuid + stidp 0(%r13) # store cpuid + tm 0(%r13),0xff # running VM ? + jno .Lnoreset + larl %r2,.Lreset + lghi %r3,26 + diag %r2,%r3,8 + larl %r5,.Lirb + stsch 0(%r5) # check if irq is pending + tm 30(%r5),0x0f # by verifying if any of the + jnz .Lwaitforirq # activity or status control + tm 31(%r5),0xff # bits is set in the schib + jz .Lnoreset +.Lwaitforirq: + bras %r14,.Lirqwait # wait for IO interrupt + c %r1,__LC_SUBCHANNEL_ID # compare subchannel number + jne .Lwaitforirq + larl %r5,.Lirb + tsch 0(%r5) +.Lnoreset: + j .Lnoload +# +# everything loaded, go for it +# +.Lnoload: + jg startup # # subroutine to wait for end I/O # .Lirqwait: - mvc __LC_IO_NEW_PSW(16),.Lnewpsw # set up IO interrupt psw - lpsw .Lwaitpsw + larl %r13,.Lnewpswmask # set up IO interrupt psw + mvc __LC_IO_NEW_PSW(8),0(%r13) + stg %r14,__LC_IO_NEW_PSW+8 + larl %r13,.Lwaitpsw + lpswe 0(%r13) .Lioint: - br %r14 - .align 8 -.Lnewpsw: - .quad 0x0000000080000000,.Lioint -.Lwaitpsw: - .long 0x020a0000,0x80000000+.Lioint - # # subroutine for loading cards from the reader # .Lloader: - la %r4,0(%r14) - la %r3,.Lorb # r2 = address of orb into r2 - la %r5,.Lirb # r4 = address of irb - la %r6,.Lccws - la %r7,20 + lgr %r4,%r14 + larl %r3,.Lorb # r2 = address of orb into r2 + larl %r5,.Lirb # r4 = address of irb + larl %r6,.Lccws + lghi %r7,20 .Linit: st %r2,4(%r6) # initialize CCW data addresses la %r2,0x50(%r2) la %r6,8(%r6) - bct 7,.Linit - - lctl %c6,%c6,.Lcr6 # set IO subclass mask - slr %r2,%r2 + brctg %r7,.Linit + larl %r13,.Lcr6 + lctlg %c6,%c6,0(%r13) + xgr %r2,%r2 .Lldlp: ssch 0(%r3) # load chunk of 1600 bytes - bnz .Llderr + jnz .Llderr .Lwait4irq: - bas %r14,.Lirqwait + bras %r14,.Lirqwait c %r1,__LC_SUBCHANNEL_ID # compare subchannel number - bne .Lwait4irq + jne .Lwait4irq tsch 0(%r5) - - slr %r0,%r0 + xgr %r0,%r0 ic %r0,8(%r5) # get device status - chi %r0,8 # channel end ? - be .Lcont - chi %r0,12 # channel end + device end ? - be .Lcont - - l %r0,4(%r5) - s %r0,8(%r3) # r0/8 = number of ccws executed - mhi %r0,10 # *10 = number of bytes in ccws - lh %r3,10(%r5) # get residual count - sr %r0,%r3 # #ccws*80-residual=#bytes read - ar %r2,%r0 - + cghi %r0,8 # channel end ? + je .Lcont + cghi %r0,12 # channel end + device end ? + je .Lcont + llgf %r0,4(%r5) + sgf %r0,8(%r3) # r0/8 = number of ccws executed + mghi %r0,10 # *10 = number of bytes in ccws + llgh %r3,10(%r5) # get residual count + sgr %r0,%r3 # #ccws*80-residual=#bytes read + agr %r2,%r0 br %r4 # r2 contains the total size - .Lcont: - ahi %r2,0x640 # add 0x640 to total size - la %r6,.Lccws - la %r7,20 + aghi %r2,0x640 # add 0x640 to total size + larl %r6,.Lccws + lghi %r7,20 .Lincr: l %r0,4(%r6) # update CCW data addresses - ahi %r0,0x640 + aghi %r0,0x640 st %r0,4(%r6) - ahi %r6,8 - bct 7,.Lincr - - b .Lldlp + aghi %r6,8 + brctg %r7,.Lincr + j .Lldlp .Llderr: - lpsw .Lcrash + larl %r13,.Lcrash + lpsw 0(%r13) .align 8 +.Lwaitpsw: + .quad 0x0202000180000000,.Lioint +.Lnewpswmask: + .quad 0x0000000180000000 + .align 8 .Lorb: .long 0x00000000,0x0080ff00,.Lccws .Lirb: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.Lcr6: .long 0xff000000 -.Lloadp:.long 0,0 + .align 8 +.Lcr6: .quad 0x00000000ff000000 .align 8 .Lcrash:.long 0x000a0000,0x00000000 - .align 8 .Lccws: .rept 19 .long 0x02600050,0x00000000 .endr .long 0x02200050,0x00000000 - -iplstart: - mvi __LC_AR_MODE_ID,1 # set esame flag - slr %r0,%r0 # set cpuid to zero - lhi %r1,2 # mode 2 = esame (dump) - sigp %r1,%r0,0x12 # switch to esame mode - bras %r13,0f - .fill 16,4,0x0 -0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs - sam31 # switch to 31 bit addressing mode - lh %r1,__LC_SUBCHANNEL_ID # test if subchannel number - bct %r1,.Lnoload # is valid - l %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number - la %r2,IPL_BS # load start address - bas %r14,.Lloader # load rest of ipl image - l %r12,.Lparm # pointer to parameter area - st %r1,IPL_DEVICE+ARCH_OFFSET-PARMAREA(%r12) # save ipl device number - -# -# load parameter file from ipl device -# -.Lagain1: - l %r2,.Linitrd # ramdisk loc. is temp - bas %r14,.Lloader # load parameter file - ltr %r2,%r2 # got anything ? - bz .Lnopf - l %r3,MAX_COMMAND_LINE_SIZE+ARCH_OFFSET-PARMAREA(%r12) - ahi %r3,-1 - clr %r2,%r3 - bl .Lnotrunc - lr %r2,%r3 -.Lnotrunc: - l %r4,.Linitrd - clc 0(3,%r4),.L_hdr # if it is HDRx - bz .Lagain1 # skip dataset header - clc 0(3,%r4),.L_eof # if it is EOFx - bz .Lagain1 # skip dateset trailer - - lr %r5,%r2 - la %r6,COMMAND_LINE-PARMAREA(%r12) - lr %r7,%r2 - ahi %r7,1 - mvcl %r6,%r4 -.Lnopf: - -# -# load ramdisk from ipl device -# -.Lagain2: - l %r2,.Linitrd # addr of ramdisk - st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12) - bas %r14,.Lloader # load ramdisk - st %r2,INITRD_SIZE+ARCH_OFFSET-PARMAREA(%r12) # store size of rd - ltr %r2,%r2 - bnz .Lrdcont - st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12) # no ramdisk found -.Lrdcont: - l %r2,.Linitrd - - clc 0(3,%r2),.L_hdr # skip HDRx and EOFx - bz .Lagain2 - clc 0(3,%r2),.L_eof - bz .Lagain2 - -# -# reset files in VM reader -# - stidp .Lcpuid # store cpuid - tm .Lcpuid,0xff # running VM ? - bno .Lnoreset - la %r2,.Lreset - lhi %r3,26 - diag %r2,%r3,8 - la %r5,.Lirb - stsch 0(%r5) # check if irq is pending - tm 30(%r5),0x0f # by verifying if any of the - bnz .Lwaitforirq # activity or status control - tm 31(%r5),0xff # bits is set in the schib - bz .Lnoreset -.Lwaitforirq: - bas %r14,.Lirqwait # wait for IO interrupt - c %r1,__LC_SUBCHANNEL_ID # compare subchannel number - bne .Lwaitforirq - la %r5,.Lirb - tsch 0(%r5) -.Lnoreset: - b .Lnoload - -# -# everything loaded, go for it -# -.Lnoload: - l %r1,.Lstartup - br %r1 - -.Linitrd:.long _end # default address of initrd -.Lparm: .long PARMAREA -.Lstartup: .long startup .Lreset:.byte 0xc3,0xc8,0xc1,0xd5,0xc7,0xc5,0x40,0xd9,0xc4,0xd9,0x40 .byte 0xc1,0xd3,0xd3,0x40,0xd2,0xc5,0xc5,0xd7,0x40,0xd5,0xd6 .byte 0xc8,0xd6,0xd3,0xc4 # "change rdr all keep nohold" @@ -268,10 +215,10 @@ iplstart: # this is called either by the ipl loader or directly by PSW restart # or linload or SALIPL # - .org STARTUP_NORMAL_OFFSET + .org STARTUP_NORMAL_OFFSET - IPL_START SYM_CODE_START(startup) j startup_normal - .org EP_OFFSET + .org EP_OFFSET - IPL_START # # This is a list of s390 kernel entry points. At address 0x1000f the number of # valid entry points is stored. @@ -283,7 +230,7 @@ SYM_CODE_START(startup) # # kdump startup-code, running in 64 bit absolute addressing mode # - .org STARTUP_KDUMP_OFFSET + .org STARTUP_KDUMP_OFFSET - IPL_START j startup_kdump SYM_CODE_END(startup) SYM_CODE_START_LOCAL(startup_normal) @@ -295,20 +242,23 @@ SYM_CODE_START_LOCAL(startup_normal) .fill 16,4,0x0 0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs sam64 # switch to 64 bit addressing mode - basr %r13,0 # get base -.LPG0: - mvc __LC_EXT_NEW_PSW(16),.Lext_new_psw-.LPG0(%r13) - mvc __LC_PGM_NEW_PSW(16),.Lpgm_new_psw-.LPG0(%r13) - mvc __LC_IO_NEW_PSW(16),.Lio_new_psw-.LPG0(%r13) + larl %r13,.Lext_new_psw + mvc __LC_EXT_NEW_PSW(16),0(%r13) + larl %r13,.Lpgm_new_psw + mvc __LC_PGM_NEW_PSW(16),0(%r13) + larl %r13,.Lio_new_psw + mvc __LC_IO_NEW_PSW(16),0(%r13) xc 0x200(256),0x200 # partially clear lowcore xc 0x300(256),0x300 xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 - lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers + larl %r13,.Lctl + lctlg %c0,%c15,0(%r13) # load control registers stcke __LC_BOOT_CLOCK mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 - spt 6f-.LPG0(%r13) - mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13) + larl %r13,6f + spt 0(%r13) + mvc __LC_LAST_UPDATE_TIMER(8),0(%r13) larl %r15,_stack_end-STACK_FRAME_OVERHEAD brasl %r14,sclp_early_setup_buffer brasl %r14,verify_facilities @@ -368,23 +318,3 @@ SYM_CODE_START_LOCAL(startup_pgm_check_handler) lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r8) lpswe __LC_RETURN_PSW # disabled wait SYM_CODE_END(startup_pgm_check_handler) - -# -# params at 10400 (setup.h) -# Must be keept in sync with struct parmarea in setup.h -# - .org PARMAREA -SYM_DATA_START(parmarea) - .quad 0 # IPL_DEVICE - .quad 0 # INITRD_START - .quad 0 # INITRD_SIZE - .quad 0 # OLDMEM_BASE - .quad 0 # OLDMEM_SIZE - .quad kernel_version # points to kernel version string - .quad COMMAND_LINE_SIZE - - .org COMMAND_LINE - .byte "root=/dev/ram0 ro" - .byte 0 - .org PARMAREA+__PARMAREA_SIZE -SYM_DATA_END(parmarea) diff --git a/arch/s390/boot/ipl_data.c b/arch/s390/boot/ipl_data.c new file mode 100644 index 000000000000..0846e2b249c6 --- /dev/null +++ b/arch/s390/boot/ipl_data.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/compat.h> +#include <linux/ptrace.h> +#include <asm/cio.h> +#include <asm/asm-offsets.h> +#include "boot.h" + +#define CCW0(cmd, addr, cnt, flg) \ + { .cmd_code = cmd, .cda = addr, .count = cnt, .flags = flg, } + +#define PSW_MASK_DISABLED (PSW_MASK_WAIT | PSW_MASK_EA | PSW_MASK_BA) + +struct ipl_lowcore { + psw_t32 ipl_psw; /* 0x0000 */ + struct ccw0 ccwpgm[2]; /* 0x0008 */ + u8 fill[56]; /* 0x0018 */ + struct ccw0 ccwpgmcc[20]; /* 0x0050 */ + u8 pad_0xf0[0x01a0-0x00f0]; /* 0x00f0 */ + psw_t restart_psw; /* 0x01a0 */ + psw_t external_new_psw; /* 0x01b0 */ + psw_t svc_new_psw; /* 0x01c0 */ + psw_t program_new_psw; /* 0x01d0 */ + psw_t mcck_new_psw; /* 0x01e0 */ + psw_t io_new_psw; /* 0x01f0 */ +}; + +/* + * Initial lowcore for IPL: the first 24 bytes are loaded by IPL to + * addresses 0-23 (a PSW and two CCWs). Bytes 24-79 are discarded. + * The next 160 bytes are loaded to addresses 0x18-0xb7. They form + * the continuation of the CCW program started by IPL and load the + * range 0x0f0-0x730 from the image to the range 0x0f0-0x730 in + * memory. At the end of the channel program the PSW at location 0 is + * loaded. + * Initial processing starts at 0x200 = iplstart. + * + * The restart psw points to iplstart which allows to load a kernel + * image into memory and starting it by a psw restart on any cpu. All + * other default psw new locations contain a disabled wait psw where + * the address indicates which psw was loaded. + * + * Note that the 'file' utility can detect s390 kernel images. For + * that to succeed the two initial CCWs, and the 0x40 fill bytes must + * be present. + */ +static struct ipl_lowcore ipl_lowcore __used __section(".ipldata") = { + .ipl_psw = { .mask = PSW32_MASK_BASE, .addr = PSW32_ADDR_AMODE | IPL_START }, + .ccwpgm = { + [ 0] = CCW0(CCW_CMD_READ_IPL, 0x018, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 1] = CCW0(CCW_CMD_READ_IPL, 0x068, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + }, + .fill = { + [ 0 ... 55] = 0x40, + }, + .ccwpgmcc = { + [ 0] = CCW0(CCW_CMD_READ_IPL, 0x0f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 1] = CCW0(CCW_CMD_READ_IPL, 0x140, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 2] = CCW0(CCW_CMD_READ_IPL, 0x190, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 3] = CCW0(CCW_CMD_READ_IPL, 0x1e0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 4] = CCW0(CCW_CMD_READ_IPL, 0x230, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 5] = CCW0(CCW_CMD_READ_IPL, 0x280, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 6] = CCW0(CCW_CMD_READ_IPL, 0x2d0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 7] = CCW0(CCW_CMD_READ_IPL, 0x320, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 8] = CCW0(CCW_CMD_READ_IPL, 0x370, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 9] = CCW0(CCW_CMD_READ_IPL, 0x3c0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [10] = CCW0(CCW_CMD_READ_IPL, 0x410, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [11] = CCW0(CCW_CMD_READ_IPL, 0x460, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [12] = CCW0(CCW_CMD_READ_IPL, 0x4b0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [13] = CCW0(CCW_CMD_READ_IPL, 0x500, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [14] = CCW0(CCW_CMD_READ_IPL, 0x550, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [15] = CCW0(CCW_CMD_READ_IPL, 0x5a0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [16] = CCW0(CCW_CMD_READ_IPL, 0x5f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [17] = CCW0(CCW_CMD_READ_IPL, 0x640, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [18] = CCW0(CCW_CMD_READ_IPL, 0x690, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [19] = CCW0(CCW_CMD_READ_IPL, 0x6e0, 0x50, CCW_FLAG_SLI), + }, + .restart_psw = { .mask = 0, .addr = IPL_START, }, + .external_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_EXT_NEW_PSW, }, + .svc_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_SVC_NEW_PSW, }, + .program_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_PGM_NEW_PSW, }, + .mcck_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_MCK_NEW_PSW, }, + .io_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_IO_NEW_PSW, }, +}; diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 9ed7e29c81d9..ca78d6162245 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -8,9 +8,16 @@ #include <asm/sections.h> #include <asm/boot_data.h> #include <asm/facility.h> +#include <asm/setup.h> #include <asm/uv.h> #include "boot.h" +struct parmarea parmarea __section(".parmarea") = { + .kernel_version = (unsigned long)kernel_version, + .max_command_line_size = COMMAND_LINE_SIZE, + .command_line = "root=/dev/ram0 ro", +}; + char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; int __bootdata(noexec_disabled); diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index d8984462071f..e8d74d4f62aa 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -8,7 +8,7 @@ #include <asm/timex.h> #include <asm/sclp.h> #include <asm/kasan.h> -#include "compressed/decompressor.h" +#include "decompressor.h" #include "boot.h" #define PRNG_MODE_TDES 1 diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c index 2f949cd9076b..7fa1a32ea0f3 100644 --- a/arch/s390/boot/mem_detect.c +++ b/arch/s390/boot/mem_detect.c @@ -7,7 +7,7 @@ #include <asm/sections.h> #include <asm/mem_detect.h> #include <asm/sparsemem.h> -#include "compressed/decompressor.h" +#include "decompressor.h" #include "boot.h" struct mem_detect_info __bootdata(mem_detect); diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 1aa11a8f57dd..863e6bcaa5a1 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -10,7 +10,7 @@ #include <asm/sclp.h> #include <asm/diag.h> #include <asm/uv.h> -#include "compressed/decompressor.h" +#include "decompressor.h" #include "boot.h" #include "uv.h" diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index 918e05137d4c..af5c6860e0a1 100644 --- a/arch/s390/boot/compressed/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -4,6 +4,7 @@ #include <asm/thread_info.h> #include <asm/page.h> #include <asm/sclp.h> +#include "boot.h" OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") OUTPUT_ARCH(s390:64-bit) @@ -13,11 +14,19 @@ ENTRY(startup) SECTIONS { . = 0; + .ipldata : { + *(.ipldata) + } + . = IPL_START; .head.text : { _head = . ; HEAD_TEXT _ehead = . ; } + . = PARMAREA; + .parmarea : { + *(.parmarea) + } .text : { _text = .; /* Text */ *(.text) diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index bfbafd35bcbd..e013088b5115 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -194,7 +194,7 @@ static struct skcipher_alg cbc_des_alg = { * same as DES. Implementers MUST reject keys that exhibit this * property. * - * In fips mode additinally check for all 3 keys are unique. + * In fips mode additionally check for all 3 keys are unique. * */ static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 234d791ca59d..ae382bafc772 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -528,7 +528,7 @@ static ssize_t prng_tdes_read(struct file *file, char __user *ubuf, /* give mutex free before calling schedule() */ mutex_unlock(&prng_data->mutex); schedule(); - /* occopy mutex again */ + /* occupy mutex again */ if (mutex_lock_interruptible(&prng_data->mutex)) { if (ret == 0) ret = -ERESTARTSYS; diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c index 3765c2d81df5..a3d881ca0a98 100644 --- a/arch/s390/hypfs/hypfs_vm.c +++ b/arch/s390/hypfs/hypfs_vm.c @@ -190,7 +190,7 @@ int hypfs_vm_create_files(struct dentry *root) if (IS_ERR(data)) return PTR_ERR(data); - /* Hpervisor Info */ + /* Hypervisor Info */ dir = hypfs_mkdir(root, "hyp"); if (IS_ERR(dir)) { rc = PTR_ERR(dir); diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h index bb3837d7387c..7db046596b93 100644 --- a/arch/s390/include/asm/alternative-asm.h +++ b/arch/s390/include/asm/alternative-asm.h @@ -5,19 +5,6 @@ #ifdef __ASSEMBLY__ /* - * Check the length of an instruction sequence. The length may not be larger - * than 254 bytes and it has to be divisible by 2. - */ -.macro alt_len_check start,end - .if ( \end - \start ) > 254 - .error "cpu alternatives does not support instructions blocks > 254 bytes\n" - .endif - .if ( \end - \start ) % 2 - .error "cpu alternatives instructions length is odd\n" - .endif -.endm - -/* * Issue one struct alt_instr descriptor entry (need to put it into * the section .altinstructions, see below). This entry contains * enough information for the alternatives patching code to patch an @@ -28,66 +15,29 @@ .long \alt_start - . .word \feature .byte \orig_end - \orig_start - .byte \alt_end - \alt_start -.endm - -/* - * Fill up @bytes with nops. The macro emits 6-byte nop instructions - * for the bulk of the area, possibly followed by a 4-byte and/or - * a 2-byte nop if the size of the area is not divisible by 6. - */ -.macro alt_pad_fill bytes - .rept ( \bytes ) / 6 - brcl 0,0 - .endr - .rept ( \bytes ) % 6 / 4 - nop - .endr - .rept ( \bytes ) % 6 % 4 / 2 - nopr - .endr -.endm - -/* - * Fill up @bytes with nops. If the number of bytes is larger - * than 6, emit a jg instruction to branch over all nops, then - * fill an area of size (@bytes - 6) with nop instructions. - */ -.macro alt_pad bytes - .if ( \bytes > 0 ) - .if ( \bytes > 6 ) - jg . + \bytes - alt_pad_fill \bytes - 6 - .else - alt_pad_fill \bytes - .endif - .endif + .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start ) + .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start ) .endm /* * Define an alternative between two instructions. If @feature is * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. ".skip" directive takes care of proper instruction padding - * in case @newinstr is longer than @oldinstr. + * @newinstr. */ .macro ALTERNATIVE oldinstr, newinstr, feature .pushsection .altinstr_replacement,"ax" 770: \newinstr 771: .popsection 772: \oldinstr -773: alt_len_check 770b, 771b - alt_len_check 772b, 773b - alt_pad ( ( 771b - 770b ) - ( 773b - 772b ) ) -774: .pushsection .altinstructions,"a" - alt_entry 772b, 774b, 770b, 771b, \feature +773: .pushsection .altinstructions,"a" + alt_entry 772b, 773b, 770b, 771b, \feature .popsection .endm /* * Define an alternative between two instructions. If @feature is * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. ".skip" directive takes care of proper instruction padding - * in case @newinstr is longer than @oldinstr. + * @newinstr. */ .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 .pushsection .altinstr_replacement,"ax" @@ -95,17 +45,9 @@ 771: \newinstr2 772: .popsection 773: \oldinstr -774: alt_len_check 770b, 771b - alt_len_check 771b, 772b - alt_len_check 773b, 774b - .if ( 771b - 770b > 772b - 771b ) - alt_pad ( ( 771b - 770b ) - ( 774b - 773b ) ) - .else - alt_pad ( ( 772b - 771b ) - ( 774b - 773b ) ) - .endif -775: .pushsection .altinstructions,"a" - alt_entry 773b, 775b, 770b, 771b,\feature1 - alt_entry 773b, 775b, 771b, 772b,\feature2 +774: .pushsection .altinstructions,"a" + alt_entry 773b, 774b, 770b, 771b,\feature1 + alt_entry 773b, 774b, 771b, 772b,\feature2 .popsection .endm diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 3f2856ed6808..904dd049f954 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -13,32 +13,25 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 facility; /* facility bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction */ } __packed; void apply_alternative_instructions(void); void apply_alternatives(struct alt_instr *start, struct alt_instr *end); /* - * |661: |662: |6620 |663: - * +-----------+---------------------+ - * | oldinstr | oldinstr_padding | - * | +----------+----------+ - * | | | | - * | | >6 bytes |6/4/2 nops| - * | |6 bytes jg-----------> - * +-----------+---------------------+ - * ^^ static padding ^^ + * +---------------------------------+ + * |661: |662: + * | oldinstr | + * +---------------------------------+ * * .altinstr_replacement section - * +---------------------+-----------+ + * +---------------------------------+ * |6641: |6651: * | alternative instr 1 | - * +-----------+---------+- - - - - -+ - * |6642: |6652: | - * | alternative instr 2 | padding - * +---------------------+- - - - - -+ - * ^ runtime ^ + * +---------------------------------+ + * |6642: |6652: + * | alternative instr 2 | + * +---------------------------------+ * * .altinstructions section * +---------------------------------+ @@ -47,77 +40,31 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); * +---------------------------------+ */ -#define b_altinstr(num) "664"#num -#define e_altinstr(num) "665"#num - -#define e_oldinstr_pad_end "663" +#define b_altinstr(num) "664"#num +#define e_altinstr(num) "665"#num #define oldinstr_len "662b-661b" -#define oldinstr_total_len e_oldinstr_pad_end"b-661b" #define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b" -#define oldinstr_pad_len(num) \ - "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \ - "((" altinstr_len(num) ")-(" oldinstr_len "))" - -#define INSTR_LEN_SANITY_CHECK(len) \ - ".if " len " > 254\n" \ - "\t.error \"cpu alternatives does not support instructions " \ - "blocks > 254 bytes\"\n" \ - ".endif\n" \ - ".if (" len ") %% 2\n" \ - "\t.error \"cpu alternatives instructions length is odd\"\n" \ - ".endif\n" - -#define OLDINSTR_PADDING(oldinstr, num) \ - ".if " oldinstr_pad_len(num) " > 6\n" \ - "\tjg " e_oldinstr_pad_end "f\n" \ - "6620:\n" \ - "\t.rept (" oldinstr_pad_len(num) " - (6620b-662b)) / 2\n" \ - "\tnopr\n" \ - ".else\n" \ - "\t.rept " oldinstr_pad_len(num) " / 6\n" \ - "\t.brcl 0,0\n" \ - "\t.endr\n" \ - "\t.rept " oldinstr_pad_len(num) " %% 6 / 4\n" \ - "\tnop\n" \ - "\t.endr\n" \ - "\t.rept " oldinstr_pad_len(num) " %% 6 %% 4 / 2\n" \ - "\tnopr\n" \ - ".endr\n" \ - ".endif\n" - -#define OLDINSTR(oldinstr, num) \ - "661:\n\t" oldinstr "\n662:\n" \ - OLDINSTR_PADDING(oldinstr, num) \ - e_oldinstr_pad_end ":\n" \ - INSTR_LEN_SANITY_CHECK(oldinstr_len) - -#define OLDINSTR_2(oldinstr, num1, num2) \ - "661:\n\t" oldinstr "\n662:\n" \ - ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \ - OLDINSTR_PADDING(oldinstr, num2) \ - ".else\n" \ - OLDINSTR_PADDING(oldinstr, num1) \ - ".endif\n" \ - e_oldinstr_pad_end ":\n" \ - INSTR_LEN_SANITY_CHECK(oldinstr_len) + +#define OLDINSTR(oldinstr) \ + "661:\n\t" oldinstr "\n662:\n" #define ALTINSTR_ENTRY(facility, num) \ "\t.long 661b - .\n" /* old instruction */ \ "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \ "\t.word " __stringify(facility) "\n" /* facility bit */ \ - "\t.byte " oldinstr_total_len "\n" /* source len */ \ - "\t.byte " altinstr_len(num) "\n" /* alt instruction len */ + "\t.byte " oldinstr_len "\n" /* instruction len */ \ + "\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n" \ + "\t.org . - (" altinstr_len(num) ") + (" oldinstr_len ")\n" #define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \ - b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \ - INSTR_LEN_SANITY_CHECK(altinstr_len(num)) + b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, altinstr, facility) \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(altinstr, 1) \ ".popsection\n" \ - OLDINSTR(oldinstr, 1) \ + OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(facility, 1) \ ".popsection\n" @@ -127,7 +74,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); ALTINSTR_REPLACEMENT(altinstr1, 1) \ ALTINSTR_REPLACEMENT(altinstr2, 2) \ ".popsection\n" \ - OLDINSTR_2(oldinstr, 1, 2) \ + OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(facility1, 1) \ ALTINSTR_ENTRY(facility2, 2) \ diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h index fb62df5e16a2..f24d9591aaed 100644 --- a/arch/s390/include/asm/asm-extable.h +++ b/arch/s390/include/asm/asm-extable.h @@ -26,16 +26,16 @@ stringify_in_c(.long (_target) - .;) \ stringify_in_c(.short (_type);) \ stringify_in_c(.macro extable_reg reg;) \ - stringify_in_c(.set found, 0;) \ - stringify_in_c(.set regnr, 0;) \ + stringify_in_c(.set .Lfound, 0;) \ + stringify_in_c(.set .Lregnr, 0;) \ stringify_in_c(.irp rs,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;) \ stringify_in_c(.ifc "\reg", "%%\rs";) \ - stringify_in_c(.set found, 1;) \ - stringify_in_c(.short regnr;) \ + stringify_in_c(.set .Lfound, 1;) \ + stringify_in_c(.short .Lregnr;) \ stringify_in_c(.endif;) \ - stringify_in_c(.set regnr, regnr+1;) \ + stringify_in_c(.set .Lregnr, .Lregnr+1;) \ stringify_in_c(.endr;) \ - stringify_in_c(.ifne (found != 1);) \ + stringify_in_c(.ifne (.Lfound != 1);) \ stringify_in_c(.error "extable_reg: bad register argument";) \ stringify_in_c(.endif;) \ stringify_in_c(.endm;) \ diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index 2c057e1f3200..82de2a7c4160 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -26,14 +26,14 @@ static __always_inline void bcr_serialize(void) asm volatile(__ASM_BCR_SERIALIZE : : : "memory"); } -#define mb() bcr_serialize() -#define rmb() barrier() -#define wmb() barrier() -#define dma_rmb() mb() -#define dma_wmb() mb() -#define __smp_mb() mb() -#define __smp_rmb() rmb() -#define __smp_wmb() wmb() +#define __mb() bcr_serialize() +#define __rmb() barrier() +#define __wmb() barrier() +#define __dma_rmb() __mb() +#define __dma_wmb() __mb() +#define __smp_mb() __mb() +#define __smp_rmb() __rmb() +#define __smp_wmb() __wmb() #define __smp_store_release(p, v) \ do { \ diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index 0b25f28351ed..aebe1e22c7be 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -15,7 +15,8 @@ "1: .asciz \""__FILE__"\"\n" \ ".previous\n" \ ".section __bug_table,\"awM\",@progbits,%2\n" \ - "2: .long 0b-2b,1b-2b\n" \ + "2: .long 0b-.\n" \ + " .long 1b-.\n" \ " .short %0,%1\n" \ " .org 2b+%2\n" \ ".previous\n" \ @@ -30,7 +31,7 @@ asm_inline volatile( \ "0: mc 0,0\n" \ ".section __bug_table,\"awM\",@progbits,%1\n" \ - "1: .long 0b-1b\n" \ + "1: .long 0b-.\n" \ " .short %0\n" \ " .org 1b+%1\n" \ ".previous\n" \ diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 1effac6a0152..1c4f585dd39b 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -369,7 +369,7 @@ void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev); struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages); /* Function from drivers/s390/cio/chsc.c */ -int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); +int chsc_sstpc(void *page, unsigned int op, u16 ctrl, long *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); int chsc_stzi(void *page, void *result, size_t size); int chsc_sgib(u32 origin); diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index cdc7ae72529d..7d6fe813ac39 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> +#include <asm/ptrace.h> #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; @@ -22,32 +23,8 @@ typedef u16 compat_mode_t; (__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ }) -#define PSW32_MASK_PER 0x40000000UL -#define PSW32_MASK_DAT 0x04000000UL -#define PSW32_MASK_IO 0x02000000UL -#define PSW32_MASK_EXT 0x01000000UL -#define PSW32_MASK_KEY 0x00F00000UL -#define PSW32_MASK_BASE 0x00080000UL /* Always one */ -#define PSW32_MASK_MCHECK 0x00040000UL -#define PSW32_MASK_WAIT 0x00020000UL -#define PSW32_MASK_PSTATE 0x00010000UL -#define PSW32_MASK_ASC 0x0000C000UL -#define PSW32_MASK_CC 0x00003000UL -#define PSW32_MASK_PM 0x00000f00UL -#define PSW32_MASK_RI 0x00000080UL - #define PSW32_MASK_USER 0x0000FF00UL -#define PSW32_ADDR_AMODE 0x80000000UL -#define PSW32_ADDR_INSN 0x7FFFFFFFUL - -#define PSW32_DEFAULT_KEY (((u32) PAGE_DEFAULT_ACC) << 20) - -#define PSW32_ASC_PRIMARY 0x00000000UL -#define PSW32_ASC_ACCREG 0x00004000UL -#define PSW32_ASC_SECONDARY 0x00008000UL -#define PSW32_ASC_HOME 0x0000C000UL - #define PSW32_USER_BITS (PSW32_MASK_DAT | PSW32_MASK_IO | PSW32_MASK_EXT | \ PSW32_DEFAULT_KEY | PSW32_MASK_BASE | \ PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \ diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index 82388da3f95f..267a8f88e143 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -93,7 +93,9 @@ union ctlreg0 { unsigned long tcx : 1; /* Transactional-Execution control */ unsigned long pifo : 1; /* Transactional-Execution Program- Interruption-Filtering Override */ - unsigned long : 22; + unsigned long : 3; + unsigned long ccc : 1; /* Cryptography counter control */ + unsigned long : 18; unsigned long : 3; unsigned long lap : 1; /* Low-address-protection control */ unsigned long : 4; diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h index 2f0a1cacdf85..000de2b1e67a 100644 --- a/arch/s390/include/asm/entry-common.h +++ b/arch/s390/include/asm/entry-common.h @@ -9,19 +9,21 @@ #include <linux/uaccess.h> #include <asm/timex.h> #include <asm/fpu/api.h> +#include <asm/pai.h> #define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP) void do_per_trap(struct pt_regs *regs); -#ifdef CONFIG_DEBUG_ENTRY -static __always_inline void arch_check_user_regs(struct pt_regs *regs) +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) { - debug_user_asce(0); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + debug_user_asce(0); + + pai_kernel_enter(regs); } -#define arch_check_user_regs arch_check_user_regs -#endif /* CONFIG_DEBUG_ENTRY */ +#define arch_enter_from_user_mode arch_enter_from_user_mode static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, unsigned long ti_work) @@ -44,6 +46,8 @@ static __always_inline void arch_exit_to_user_mode(void) if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) debug_user_asce(1); + + pai_kernel_exit(current_pt_regs()); } #define arch_exit_to_user_mode arch_exit_to_user_mode diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 3f8ee257f9aa..a405b6bb89fb 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -133,6 +133,8 @@ int ipl_report_add_certificate(struct ipl_report *report, void *key, * DIAG 308 support */ enum diag308_subcode { + DIAG308_CLEAR_RESET = 0, + DIAG308_LOAD_NORMAL_RESET = 1, DIAG308_REL_HSA = 2, DIAG308_LOAD_CLEAR = 3, DIAG308_LOAD_NORMAL_DUMP = 4, @@ -141,6 +143,10 @@ enum diag308_subcode { DIAG308_LOAD_NORMAL = 7, }; +enum diag308_subcode_flags { + DIAG308_FLAG_EI = 1UL << 16, +}; + enum diag308_rc { DIAG308_RC_OK = 0x0001, DIAG308_RC_NOCONFIG = 0x0102, diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 56002aeacabf..26fe5e535728 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -200,7 +200,10 @@ struct lowcore { __u64 last_break_save_area; /* 0x1338 */ __u32 access_regs_save_area[16]; /* 0x1340 */ __u64 cregs_save_area[16]; /* 0x1380 */ - __u8 pad_0x1400[0x1800-0x1400]; /* 0x1400 */ + __u8 pad_0x1400[0x1500-0x1400]; /* 0x1400 */ + /* Cryptography-counter designation */ + __u64 ccd; /* 0x1500 */ + __u8 pad_0x1508[0x1800-0x1508]; /* 0x1508 */ /* Transaction abort diagnostic block */ struct pgm_tdb pgm_tdb; /* 0x1800 */ diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 292083083830..af1cd3a6f406 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -101,7 +101,7 @@ void nmi_alloc_mcesa_early(u64 *mcesad); int nmi_alloc_mcesa(u64 *mcesad); void nmi_free_mcesa(u64 *mcesad); -void s390_handle_mcck(void); +void s390_handle_mcck(struct pt_regs *regs); void __s390_handle_mcck(void); int s390_do_machine_check(struct pt_regs *regs); diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index 2cfcd5ac3a8b..d910d71b5bb5 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -54,31 +54,31 @@ .endm .macro __DECODE_R expand,reg - .set __decode_fail,1 + .set .L__decode_fail,1 .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .ifc \reg,%r\r1 \expand \r1 - .set __decode_fail,0 + .set .L__decode_fail,0 .endif .endr - .if __decode_fail == 1 + .if .L__decode_fail == 1 .error "__DECODE_R failed" .endif .endm .macro __DECODE_RR expand,rsave,rtarget - .set __decode_fail,1 + .set .L__decode_fail,1 .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .ifc \rsave,%r\r1 .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .ifc \rtarget,%r\r2 \expand \r1,\r2 - .set __decode_fail,0 + .set .L__decode_fail,0 .endif .endr .endif .endr - .if __decode_fail == 1 + .if .L__decode_fail == 1 .error "__DECODE_RR failed" .endif .endm diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h new file mode 100644 index 000000000000..5b7e33ac6f0b --- /dev/null +++ b/arch/s390/include/asm/pai.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Processor Activity Instrumentation support for cryptography counters + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#ifndef _ASM_S390_PAI_H +#define _ASM_S390_PAI_H + +#include <linux/jump_label.h> +#include <asm/lowcore.h> +#include <asm/ptrace.h> + +struct qpaci_info_block { + u64 header; + struct { + u64 : 8; + u64 num_cc : 8; /* # of supported crypto counters */ + u64 : 48; + }; +}; + +static inline int qpaci(struct qpaci_info_block *info) +{ + /* Size of info (in double words minus one) */ + size_t size = sizeof(*info) / sizeof(u64) - 1; + int cc; + + asm volatile( + " lgr 0,%[size]\n" + " .insn s,0xb28f0000,%[info]\n" + " lgr %[size],0\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc), [info] "=Q" (*info), [size] "+&d" (size) + : + : "0", "cc", "memory"); + return cc ? (size + 1) * sizeof(u64) : 0; +} + +#define PAI_CRYPTO_BASE 0x1000 /* First event number */ +#define PAI_CRYPTO_MAXCTR 256 /* Max # of event counters */ +#define PAI_CRYPTO_KERNEL_OFFSET 2048 + +DECLARE_STATIC_KEY_FALSE(pai_key); + +static __always_inline void pai_kernel_enter(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_PERF_EVENTS)) + return; + if (!static_branch_unlikely(&pai_key)) + return; + if (!S390_lowcore.ccd) + return; + if (!user_mode(regs)) + return; + WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd | PAI_CRYPTO_KERNEL_OFFSET); +} + +static __always_inline void pai_kernel_exit(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_PERF_EVENTS)) + return; + if (!static_branch_unlikely(&pai_key)) + return; + if (!S390_lowcore.ccd) + return; + if (!user_mode(regs)) + return; + WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd & ~PAI_CRYPTO_KERNEL_OFFSET); +} + +#endif diff --git a/arch/s390/include/asm/pci_debug.h b/arch/s390/include/asm/pci_debug.h index 5dfe47588277..3bb4e7e33a0e 100644 --- a/arch/s390/include/asm/pci_debug.h +++ b/arch/s390/include/asm/pci_debug.h @@ -17,9 +17,14 @@ extern debug_info_t *pci_debug_err_id; debug_text_event(pci_debug_err_id, 0, debug_buffer); \ } while (0) +static inline void zpci_err_hex_level(int level, void *addr, int len) +{ + debug_event(pci_debug_err_id, level, addr, len); +} + static inline void zpci_err_hex(void *addr, int len) { - debug_event(pci_debug_err_id, 0, addr, len); + zpci_err_hex_level(0, addr, len); } #endif diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h index d9d5350cc3ec..bf15da0fedbc 100644 --- a/arch/s390/include/asm/preempt.h +++ b/arch/s390/include/asm/preempt.h @@ -46,10 +46,17 @@ static inline bool test_preempt_need_resched(void) static inline void __preempt_count_add(int val) { - if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) - __atomic_add_const(val, &S390_lowcore.preempt_count); - else - __atomic_add(val, &S390_lowcore.preempt_count); + /* + * With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES + * enabled, gcc 12 fails to handle __builtin_constant_p(). + */ + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES)) { + if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) { + __atomic_add_const(val, &S390_lowcore.preempt_count); + return; + } + } + __atomic_add(val, &S390_lowcore.preempt_count); } static inline void __preempt_count_sub(int val) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index ff1e25d515a8..add764a2be8c 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -83,6 +83,7 @@ void cpu_detect_mhz_feature(void); extern const struct seq_operations cpuinfo_op; extern void execve_tail(void); extern void __bpon(void); +unsigned long vdso_size(void); /* * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit. @@ -94,9 +95,10 @@ extern void __bpon(void); (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1)) #define TASK_SIZE_MAX (-PAGE_SIZE) -#define STACK_TOP (test_thread_flag(TIF_31BIT) ? \ - _REGION3_SIZE : _REGION2_SIZE) -#define STACK_TOP_MAX _REGION2_SIZE +#define VDSO_BASE (STACK_TOP + PAGE_SIZE) +#define VDSO_LIMIT (test_thread_flag(TIF_31BIT) ? _REGION3_SIZE : _REGION2_SIZE) +#define STACK_TOP (VDSO_LIMIT - vdso_size() - PAGE_SIZE) +#define STACK_TOP_MAX (_REGION2_SIZE - vdso_size() - PAGE_SIZE) #define HAVE_ARCH_PICK_MMAP_LAYOUT diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index ddb70fb13fbc..8bae33ab320a 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -71,6 +71,35 @@ enum { &(*(struct psw_bits *)(&(__psw))); \ })) +#define PSW32_MASK_PER 0x40000000UL +#define PSW32_MASK_DAT 0x04000000UL +#define PSW32_MASK_IO 0x02000000UL +#define PSW32_MASK_EXT 0x01000000UL +#define PSW32_MASK_KEY 0x00F00000UL +#define PSW32_MASK_BASE 0x00080000UL /* Always one */ +#define PSW32_MASK_MCHECK 0x00040000UL +#define PSW32_MASK_WAIT 0x00020000UL +#define PSW32_MASK_PSTATE 0x00010000UL +#define PSW32_MASK_ASC 0x0000C000UL +#define PSW32_MASK_CC 0x00003000UL +#define PSW32_MASK_PM 0x00000f00UL +#define PSW32_MASK_RI 0x00000080UL + +#define PSW32_ADDR_AMODE 0x80000000UL +#define PSW32_ADDR_INSN 0x7FFFFFFFUL + +#define PSW32_DEFAULT_KEY (((u32)PAGE_DEFAULT_ACC) << 20) + +#define PSW32_ASC_PRIMARY 0x00000000UL +#define PSW32_ASC_ACCREG 0x00004000UL +#define PSW32_ASC_SECONDARY 0x00008000UL +#define PSW32_ASC_HOME 0x0000C000UL + +typedef struct { + unsigned int mask; + unsigned int addr; +} psw_t32 __aligned(8); + #define PGM_INT_CODE_MASK 0x7f #define PGM_INT_CODE_PER 0x80 diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 04cb1e7582a6..236b34b75ddb 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -87,6 +87,7 @@ struct sclp_info { unsigned char has_diag318 : 1; unsigned char has_sipl : 1; unsigned char has_dirq : 1; + unsigned char has_iplcc : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h index a7c3ccf681da..7ce584aff5bb 100644 --- a/arch/s390/include/asm/scsw.h +++ b/arch/s390/include/asm/scsw.h @@ -508,9 +508,21 @@ static inline int scsw_cmd_is_valid_zcc(union scsw *scsw) */ static inline int scsw_cmd_is_valid_ectl(union scsw *scsw) { - return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && - !(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS); + /* Must be status pending. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Must have alert status. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS)) + return 0; + + /* Must be alone or together with primary, secondary or both, + * => no intermediate status. + */ + if (scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) + return 0; + + return 1; } /** @@ -522,10 +534,25 @@ static inline int scsw_cmd_is_valid_ectl(union scsw *scsw) */ static inline int scsw_cmd_is_valid_pno(union scsw *scsw) { - return (scsw->cmd.fctl != 0) && - (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && - (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) || - (scsw->cmd.actl & SCSW_ACTL_SUSPENDED)); + /* Must indicate at least one I/O function. */ + if (!scsw->cmd.fctl) + return 0; + + /* Must be status pending. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Can be status pending alone, or with any combination of primary, + * secondary and alert => no intermediate status. + */ + if (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS)) + return 1; + + /* If intermediate, must be suspended. */ + if (scsw->cmd.actl & SCSW_ACTL_SUSPENDED) + return 1; + + return 0; } /** @@ -675,9 +702,21 @@ static inline int scsw_tm_is_valid_q(union scsw *scsw) */ static inline int scsw_tm_is_valid_ectl(union scsw *scsw) { - return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && - !(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS); + /* Must be status pending. */ + if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Must have alert status. */ + if (!(scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS)) + return 0; + + /* Must be alone or together with primary, secondary or both, + * => no intermediate status. + */ + if (scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) + return 0; + + return 1; } /** @@ -689,11 +728,25 @@ static inline int scsw_tm_is_valid_ectl(union scsw *scsw) */ static inline int scsw_tm_is_valid_pno(union scsw *scsw) { - return (scsw->tm.fctl != 0) && - (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && - (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) || - ((scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->tm.actl & SCSW_ACTL_SUSPENDED))); + /* Must indicate at least one I/O function. */ + if (!scsw->tm.fctl) + return 0; + + /* Must be status pending. */ + if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Can be status pending alone, or with any combination of primary, + * secondary and alert => no intermediate status. + */ + if (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS)) + return 1; + + /* If intermediate, must be suspended. */ + if (scsw->tm.actl & SCSW_ACTL_SUSPENDED) + return 1; + + return 0; } /** diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 24a54443c865..37127cd7749e 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -77,8 +77,9 @@ static inline int arch_spin_trylock(arch_spinlock_t *lp) static inline void arch_spin_unlock(arch_spinlock_t *lp) { typecheck(int, lp->lock); + kcsan_release(); asm_inline volatile( - ALTERNATIVE("", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */ " sth %1,%0\n" : "=R" (((unsigned short *) &lp->lock)[1]) : "d" (0) : "cc", "memory"); diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h index ba07463897c1..4d74d7e33340 100644 --- a/arch/s390/include/asm/stp.h +++ b/arch/s390/include/asm/stp.h @@ -44,8 +44,8 @@ struct stp_sstpi { u32 : 32; u32 ctnid[3]; u32 : 32; - u32 todoff[4]; - u32 rsvd[48]; + u64 todoff; + u32 rsvd[50]; } __packed; struct stp_tzib { diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 2cfce42aa7fc..ce878e85b6e4 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -197,6 +197,7 @@ static inline cycles_t get_cycles(void) { return (cycles_t) get_tod_clock() >> 2; } +#define get_cycles get_cycles int get_phys_clock(unsigned long *clock); void init_cpu_timer(void); diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h index 87e6cc2aeba4..95480ed9149e 100644 --- a/arch/s390/include/asm/vx-insn.h +++ b/arch/s390/include/asm/vx-insn.h @@ -366,7 +366,7 @@ .macro VLM vfrom, vto, disp, base, hint=3 VX_NUM v1, \vfrom VX_NUM v3, \vto - GR_NUM b2, \base /* Base register */ + GR_NUM b2, \base .word 0xE700 | ((v1&15) << 4) | (v3&15) .word (b2 << 12) | (\disp) MRXBOPC \hint, 0x36, v1, v3 @@ -376,7 +376,7 @@ .macro VST vr1, disp, index="%r0", base VX_NUM v1, \vr1 GR_NUM x2, \index - GR_NUM b2, \base /* Base register */ + GR_NUM b2, \base .word 0xE700 | ((v1&15) << 4) | (x2&15) .word (b2 << 12) | (\disp) MRXBOPC 0, 0x0E, v1 @@ -386,7 +386,7 @@ .macro VSTM vfrom, vto, disp, base, hint=3 VX_NUM v1, \vfrom VX_NUM v3, \vto - GR_NUM b2, \base /* Base register */ + GR_NUM b2, \base .word 0xE700 | ((v1&15) << 4) | (v3&15) .word (b2 << 12) | (\disp) MRXBOPC \hint, 0x3E, v1, v3 diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h index 7349e96d28a0..924b876f992c 100644 --- a/arch/s390/include/uapi/asm/pkey.h +++ b/arch/s390/include/uapi/asm/pkey.h @@ -171,7 +171,7 @@ struct pkey_skey2pkey { #define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey) /* - * Verify the given CCA AES secure key for being able to be useable with + * Verify the given CCA AES secure key for being able to be usable with * the pkey module. Check for correct key type and check for having at * least one crypto card being able to handle this key (master key * or old master key verification pattern matches). diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index 2f04a5499d74..d83713f67530 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -4,7 +4,7 @@ * * zcrypt 2.2.1 (user-visible header) * - * Copyright IBM Corp. 2001, 2019 + * Copyright IBM Corp. 2001, 2022 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * @@ -85,7 +85,7 @@ struct ica_rsa_modexpo_crt { struct CPRBX { __u16 cprb_len; /* CPRB length 220 */ __u8 cprb_ver_id; /* CPRB version id. 0x02 */ - __u8 pad_000[3]; /* Alignment pad bytes */ + __u8 _pad_000[3]; /* Alignment pad bytes */ __u8 func_id[2]; /* function id 0x5432 */ __u8 cprb_flags[4]; /* Flags */ __u32 req_parml; /* request parameter buffer len */ @@ -95,19 +95,19 @@ struct CPRBX { __u32 rpl_datal; /* reply data block len */ __u32 rpld_datal; /* replied data block len */ __u32 req_extbl; /* request extension block len */ - __u8 pad_001[4]; /* reserved */ + __u8 _pad_001[4]; /* reserved */ __u32 rpld_extbl; /* replied extension block len */ - __u8 padx000[16 - sizeof(__u8 *)]; + __u8 _pad_002[16 - sizeof(__u8 *)]; __u8 __user *req_parmb; /* request parm block 'address' */ - __u8 padx001[16 - sizeof(__u8 *)]; + __u8 _pad_003[16 - sizeof(__u8 *)]; __u8 __user *req_datab; /* request data block 'address' */ - __u8 padx002[16 - sizeof(__u8 *)]; + __u8 _pad_004[16 - sizeof(__u8 *)]; __u8 __user *rpl_parmb; /* reply parm block 'address' */ - __u8 padx003[16 - sizeof(__u8 *)]; + __u8 _pad_005[16 - sizeof(__u8 *)]; __u8 __user *rpl_datab; /* reply data block 'address' */ - __u8 padx004[16 - sizeof(__u8 *)]; + __u8 _pad_006[16 - sizeof(__u8 *)]; __u8 __user *req_extb; /* request extension block 'addr'*/ - __u8 padx005[16 - sizeof(__u8 *)]; + __u8 _pad_007[16 - sizeof(__u8 *)]; __u8 __user *rpl_extb; /* reply extension block 'address'*/ __u16 ccp_rtcode; /* server return code */ __u16 ccp_rscode; /* server reason code */ @@ -115,12 +115,10 @@ struct CPRBX { __u8 logon_id[8]; /* Logon Identifier */ __u8 mac_value[8]; /* Mac Value */ __u8 mac_content_flgs; /* Mac content flag byte */ - __u8 pad_002; /* Alignment */ + __u8 _pad_008; /* Alignment */ __u16 domain; /* Domain */ - __u8 usage_domain[4]; /* Usage domain */ - __u8 cntrl_domain[4]; /* Control domain */ - __u8 S390enf_mask[4]; /* S/390 enforcement mask */ - __u8 pad_004[36]; /* reserved */ + __u8 _pad_009[12]; /* reserved, checked for zeros */ + __u8 _pad_010[36]; /* reserved */ } __attribute__((packed)); /** @@ -238,8 +236,8 @@ struct zcrypt_device_matrix_ext { }; #define AUTOSELECT 0xFFFFFFFF -#define AUTOSEL_AP ((__u16) 0xFFFF) -#define AUTOSEL_DOM ((__u16) 0xFFFF) +#define AUTOSEL_AP ((__u16)0xFFFF) +#define AUTOSEL_DOM ((__u16)0xFFFF) #define ZCRYPT_IOCTL_MAGIC 'z' @@ -305,12 +303,12 @@ struct zcrypt_device_matrix_ext { /** * Supported ioctl calls */ -#define ICARSAMODEXPO _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0) -#define ICARSACRT _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0) -#define ZSECSENDCPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0) -#define ZSENDEP11CPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0) +#define ICARSAMODEXPO _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0) +#define ICARSACRT _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0) +#define ZSECSENDCPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0) +#define ZSENDEP11CPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0) -#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0) +#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0) #define ZCRYPT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x58, char[MAX_ZDEV_CARDIDS_EXT]) #define ZCRYPT_QDEPTH_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x59, char[MAX_ZDEV_CARDIDS_EXT]) #define ZCRYPT_PERDEV_REQCNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x5a, int[MAX_ZDEV_CARDIDS_EXT]) @@ -352,7 +350,7 @@ struct zcrypt_device_matrix { }; /* Deprecated: use ZCRYPT_DEVICE_STATUS */ -#define ZDEVICESTATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0) +#define ZDEVICESTATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0) /* Deprecated: use ZCRYPT_STATUS_MASK */ #define Z90STAT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x48, char[64]) /* Deprecated: use ZCRYPT_QDEPTH_MASK */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index c8d1b6aa823e..5851041bb214 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -72,6 +72,7 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o obj-$(CONFIG_TRACEPOINTS) += trace.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index cce0ddee2d02..e7bca29f9c34 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -7,8 +7,6 @@ #include <asm/facility.h> #include <asm/nospec-branch.h> -#define MAX_PATCH_LEN (255 - 1) - static int __initdata_or_module alt_instr_disabled; static int __init disable_alternative_instructions(char *str) @@ -19,85 +17,30 @@ static int __init disable_alternative_instructions(char *str) early_param("noaltinstr", disable_alternative_instructions); -struct brcl_insn { - u16 opc; - s32 disp; -} __packed; - -static u16 __initdata_or_module nop16 = 0x0700; -static u32 __initdata_or_module nop32 = 0x47000000; -static struct brcl_insn __initdata_or_module nop48 = { - 0xc004, 0 -}; - -static const void *nops[] __initdata_or_module = { - &nop16, - &nop32, - &nop48 -}; - -static void __init_or_module add_jump_padding(void *insns, unsigned int len) -{ - struct brcl_insn brcl = { - 0xc0f4, - len / 2 - }; - - memcpy(insns, &brcl, sizeof(brcl)); - insns += sizeof(brcl); - len -= sizeof(brcl); - - while (len > 0) { - memcpy(insns, &nop16, 2); - insns += 2; - len -= 2; - } -} - -static void __init_or_module add_padding(void *insns, unsigned int len) -{ - if (len > 6) - add_jump_padding(insns, len); - else if (len >= 2) - memcpy(insns, nops[len / 2 - 1], len); -} - static void __init_or_module __apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; u8 *instr, *replacement; - u8 insnbuf[MAX_PATCH_LEN]; /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. */ for (a = start; a < end; a++) { - int insnbuf_sz = 0; - instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; if (!__test_facility(a->facility, alt_stfle_fac_list)) continue; - if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) { + if (unlikely(a->instrlen % 2)) { WARN_ONCE(1, "cpu alternatives instructions length is " "odd, skipping patching\n"); continue; } - memcpy(insnbuf, replacement, a->replacementlen); - insnbuf_sz = a->replacementlen; - - if (a->instrlen > a->replacementlen) { - add_padding(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); - insnbuf_sz += a->instrlen - a->replacementlen; - } - - s390_kernel_write(instr, insnbuf, insnbuf_sz); + s390_kernel_write(instr, replacement, a->instrlen); } } diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index 64509e7dbd3b..ef23739b277c 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -5,69 +5,59 @@ #include <linux/compat.h> #include <linux/socket.h> #include <linux/syscalls.h> +#include <asm/ptrace.h> -/* Macro that masks the high order bit of an 32 bit pointer and converts it*/ -/* to a 64 bit pointer */ -#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) -#define AA(__x) \ - ((unsigned long)(__x)) +/* + * Macro that masks the high order bit of a 32 bit pointer and + * converts it to a 64 bit pointer. + */ +#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) +#define AA(__x) ((unsigned long)(__x)) /* Now 32bit compatibility types */ struct ipc_kludge_32 { - __u32 msgp; /* pointer */ - __s32 msgtyp; + __u32 msgp; /* pointer */ + __s32 msgtyp; }; /* asm/sigcontext.h */ -typedef union -{ - __u64 d; - __u32 f; +typedef union { + __u64 d; + __u32 f; } freg_t32; -typedef struct -{ +typedef struct { unsigned int fpc; unsigned int pad; - freg_t32 fprs[__NUM_FPRS]; + freg_t32 fprs[__NUM_FPRS]; } _s390_fp_regs32; -typedef struct -{ - __u32 mask; - __u32 addr; -} _psw_t32 __attribute__ ((aligned(8))); - -typedef struct -{ - _psw_t32 psw; +typedef struct { + psw_t32 psw; __u32 gprs[__NUM_GPRS]; __u32 acrs[__NUM_ACRS]; } _s390_regs_common32; -typedef struct -{ +typedef struct { _s390_regs_common32 regs; - _s390_fp_regs32 fpregs; + _s390_fp_regs32 fpregs; } _sigregs32; -typedef struct -{ - __u32 gprs_high[__NUM_GPRS]; - __u64 vxrs_low[__NUM_VXRS_LOW]; - __vector128 vxrs_high[__NUM_VXRS_HIGH]; - __u8 __reserved[128]; +typedef struct { + __u32 gprs_high[__NUM_GPRS]; + __u64 vxrs_low[__NUM_VXRS_LOW]; + __vector128 vxrs_high[__NUM_VXRS_HIGH]; + __u8 __reserved[128]; } _sigregs_ext32; #define _SIGCONTEXT_NSIG32 64 #define _SIGCONTEXT_NSIG_BPW32 32 #define __SIGNAL_FRAMESIZE32 96 -#define _SIGMASK_COPY_SIZE32 (sizeof(u32)*2) +#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2) -struct sigcontext32 -{ +struct sigcontext32 { __u32 oldmask[_COMPAT_NSIG_WORDS]; - __u32 sregs; /* pointer */ + __u32 sregs; /* pointer */ }; /* asm/signal.h */ @@ -75,11 +65,11 @@ struct sigcontext32 /* asm/ucontext.h */ struct ucontext32 { __u32 uc_flags; - __u32 uc_link; /* pointer */ + __u32 uc_link; /* pointer */ compat_stack_t uc_stack; _sigregs32 uc_mcontext; compat_sigset_t uc_sigmask; - /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ + /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ unsigned char __unused[128 - sizeof(compat_sigset_t)]; _sigregs_ext32 uc_mcontext_ext; }; @@ -88,25 +78,6 @@ struct stat64_emu31; struct mmap_arg_struct_emu31; struct fadvise64_64_args; -long compat_sys_s390_chown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_lchown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_fchown16(unsigned int fd, u16 user, u16 group); -long compat_sys_s390_setregid16(u16 rgid, u16 egid); -long compat_sys_s390_setgid16(u16 gid); -long compat_sys_s390_setreuid16(u16 ruid, u16 euid); -long compat_sys_s390_setuid16(u16 uid); -long compat_sys_s390_setresuid16(u16 ruid, u16 euid, u16 suid); -long compat_sys_s390_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user *suid); -long compat_sys_s390_setresgid16(u16 rgid, u16 egid, u16 sgid); -long compat_sys_s390_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user *sgid); -long compat_sys_s390_setfsuid16(u16 uid); -long compat_sys_s390_setfsgid16(u16 gid); -long compat_sys_s390_getgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_setgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_getuid16(void); -long compat_sys_s390_geteuid16(void); -long compat_sys_s390_getgid16(void); -long compat_sys_s390_getegid16(void); long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low); long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low); long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low); @@ -118,8 +89,8 @@ long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbu long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag); long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg); long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_read(unsigned int fd, char __user * buf, compat_size_t count); -long compat_sys_s390_write(unsigned int fd, const char __user * buf, compat_size_t count); +long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count); +long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count); long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise); long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args); long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 59b69c8ab5e1..df41132ccd06 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -53,19 +53,19 @@ STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE _LPP_OFFSET = __LC_LPP .macro STBEAR address - ALTERNATIVE "", ".insn s,0xb2010000,\address", 193 + ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193 .endm .macro LBEAR address - ALTERNATIVE "", ".insn s,0xb2000000,\address", 193 + ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193 .endm .macro LPSWEY address,lpswe - ALTERNATIVE "b \lpswe", ".insn siy,0xeb0000000071,\address,0", 193 + ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193 .endm .macro MBEAR reg - ALTERNATIVE "", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 .endm .macro CHECK_STACK savearea @@ -121,16 +121,16 @@ _LPP_OFFSET = __LC_LPP .endm .macro BPOFF - ALTERNATIVE "", ".insn rrf,0xb2e80000,0,0,12,0", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82 .endm .macro BPON - ALTERNATIVE "", ".insn rrf,0xb2e80000,0,0,13,0", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82 .endm .macro BPENTER tif_ptr,tif_mask ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ - "", 82 + "j .+12; nop; nop", 82 .endm .macro BPEXIT tif_ptr,tif_mask @@ -172,9 +172,19 @@ _LPP_OFFSET = __LC_LPP lgr %r14,\reg larl %r13,\start slgr %r14,%r13 - lghi %r13,\end - \start - clgr %r14,%r13 +#ifdef CONFIG_AS_IS_LLVM + clgfrl %r14,.Lrange_size\@ +#else + clgfi %r14,\end - \start +#endif jhe \outside_label +#ifdef CONFIG_AS_IS_LLVM + .section .rodata, "a" + .align 4 +.Lrange_size\@: + .long \end - \start + .previous +#endif .endm .macro SIEEXIT @@ -226,7 +236,7 @@ ENTRY(__switch_to) aghi %r3,__TASK_pid mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task - ALTERNATIVE "", "lpp _LPP_OFFSET", 40 + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 BR_EX %r14 ENDPROC(__switch_to) @@ -473,10 +483,7 @@ ENTRY(\name) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC MBEAR %r11 stmg %r8,%r9,__PT_PSW(%r11) - tm %r8,0x0001 # coming from user space? - jno 1f - lctlg %c1,%c1,__LC_KERNEL_ASCE -1: lgr %r2,%r11 # pass pointer to pt_regs + lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,\handler mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) tmhh %r8,0x0001 # returning to user ? @@ -602,6 +609,7 @@ ENTRY(mcck_int_handler) mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) la %r11,STACK_FRAME_OVERHEAD(%r1) + lgr %r2,%r11 lgr %r15,%r1 brasl %r14,s390_handle_mcck .Lmcck_return: @@ -612,7 +620,7 @@ ENTRY(mcck_int_handler) jno 0f BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER -0: ALTERNATIVE "", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 +0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 LBEAR 0(%r12) lmg %r11,%r15,__PT_R11(%r11) LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE @@ -648,7 +656,7 @@ ENTRY(mcck_int_handler) ENDPROC(mcck_int_handler) ENTRY(restart_int_handler) - ALTERNATIVE "", "lpp _LPP_OFFSET", 40 + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 stg %r15,__LC_SAVE_AREA_RESTART TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4 jz 0f diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 3033f616e256..45393919fe61 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -205,7 +205,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) unsigned long flags; int cpu; - irq_lock_sparse(); + rcu_read_lock(); desc = irq_to_desc(irq); if (!desc) goto out; @@ -224,7 +224,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) seq_putc(p, '\n'); raw_spin_unlock_irqrestore(&desc->lock, flags); out: - irq_unlock_sparse(); + rcu_read_unlock(); } /* diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 6ebf02e15c85..ab761c008f98 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -26,8 +26,10 @@ #include <asm/stacktrace.h> #include <asm/switch_to.h> #include <asm/nmi.h> +#include <asm/sclp.h> -typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long); +typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long, + unsigned long); extern const unsigned char relocate_kernel[]; extern const unsigned long long relocate_kernel_len; @@ -243,6 +245,7 @@ void machine_crash_shutdown(struct pt_regs *regs) */ static void __do_machine_kexec(void *data) { + unsigned long diag308_subcode; relocate_kernel_t data_mover; struct kimage *image = data; @@ -251,7 +254,10 @@ static void __do_machine_kexec(void *data) __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */ /* Call the moving routine */ - (*data_mover)(&image->head, image->start); + diag308_subcode = DIAG308_CLEAR_RESET; + if (sclp.has_iplcc) + diag308_subcode |= DIAG308_FLAG_EI; + (*data_mover)(&image->head, image->start, diag308_subcode); /* Die if kexec returns */ disabled_wait(); diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index fc60e29b8690..53ed3884fe64 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -29,6 +29,8 @@ #include <asm/switch_to.h> #include <asm/ctl_reg.h> #include <asm/asm-offsets.h> +#include <asm/pai.h> + #include <linux/kvm_host.h> struct mcck_struct { @@ -169,10 +171,12 @@ void __s390_handle_mcck(void) } } -void noinstr s390_handle_mcck(void) +void noinstr s390_handle_mcck(struct pt_regs *regs) { trace_hardirqs_off(); + pai_kernel_enter(regs); __s390_handle_mcck(); + pai_kernel_exit(regs); trace_hardirqs_on(); } /* diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 52c1fe23b823..0d64aafd158f 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -295,6 +295,76 @@ CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z16, CRSTE_1MB_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z16, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z16, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z16, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z16, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ, 0x0091); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_CHIP_HIT, 0x0093); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_DRAWER_HIT, 0x0094); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP, 0x0095); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_CHIP_HIT, 0x0097); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT, 0x0098); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE, 0x0099); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER, 0x009a); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER, 0x009b); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_MEMORY, 0x009c); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE_MEMORY, 0x009d); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER_MEMORY, 0x009e); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER_MEMORY, 0x009f); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT, 0x00a1); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_IV, 0x00a6); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ, 0x00a9); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_CHIP_HIT, 0x00ab); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_DRAWER_HIT, 0x00ac); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP, 0x00ad); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_CHIP_HIT, 0x00af); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT, 0x00b0); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE, 0x00b1); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER, 0x00b2); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER, 0x00b3); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_MEMORY, 0x00b4); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE_MEMORY, 0x00b5); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER_MEMORY, 0x00b6); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER_MEMORY, 0x00b7); +CPUMF_EVENT_ATTR(cf_z16, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z16, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z16, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z16, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TABORT, 0x00f4); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_NO_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_SPECIAL, 0x00f6); +CPUMF_EVENT_ATTR(cf_z16, DFLT_ACCESS, 0x00f8); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CYCLES, 0x00fd); +CPUMF_EVENT_ATTR(cf_z16, SORTL, 0x0100); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CC, 0x0109); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CCFINISH, 0x010a); +CPUMF_EVENT_ATTR(cf_z16, NNPA_INVOCATIONS, 0x010b); +CPUMF_EVENT_ATTR(cf_z16, NNPA_COMPLETIONS, 0x010c); +CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d); +CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), @@ -635,6 +705,80 @@ static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z16, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, CRSTE_1MB_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z16, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z16, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z16, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z16, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z16, SORTL), + CPUMF_EVENT_PTR(cf_z16, DFLT_CC), + CPUMF_EVENT_PTR(cf_z16, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z16, NNPA_INVOCATIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_COMPLETIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_WAIT_LOCK), + CPUMF_EVENT_PTR(cf_z16, NNPA_HOLD_LOCK), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -749,6 +893,10 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 0x8562: model = cpumcf_z15_pmu_event_attr; break; + case 0x3931: + case 0x3932: + model = cpumcf_z16_pmu_event_attr; + break; default: model = none; break; diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c new file mode 100644 index 000000000000..8c1545946d85 --- /dev/null +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -0,0 +1,688 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define KMSG_COMPONENT "pai_crypto" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/io.h> +#include <linux/perf_event.h> + +#include <asm/ctl_reg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +static debug_info_t *cfm_dbg; +static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */ + /* extracted with QPACI instruction */ + +DEFINE_STATIC_KEY_FALSE(pai_key); + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +struct paicrypt_map { + unsigned long *page; /* Page for CPU to store counters */ + struct pai_userdata *save; /* Page to store no-zero counters */ + unsigned int users; /* # of PAI crypto users */ + unsigned int sampler; /* # of PAI crypto samplers */ + unsigned int counter; /* # of PAI crypto counters */ + struct perf_event *event; /* Perf event for sampling */ +}; + +static DEFINE_PER_CPU(struct paicrypt_map, paicrypt_map); + +/* Release the PMU if event is the last perf event */ +static DEFINE_MUTEX(pai_reserve_mutex); + +/* Adjust usage counters and remove allocated memory when all users are + * gone. + */ +static void paicrypt_event_destroy(struct perf_event *event) +{ + struct paicrypt_map *cpump = per_cpu_ptr(&paicrypt_map, event->cpu); + + cpump->event = NULL; + static_branch_dec(&pai_key); + mutex_lock(&pai_reserve_mutex); + if (event->attr.sample_period) + cpump->sampler -= 1; + else + cpump->counter -= 1; + debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d" + " sampler %d counter %d\n", __func__, + event->attr.config, event->cpu, cpump->sampler, + cpump->counter); + if (!cpump->counter && !cpump->sampler) { + debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n", + __func__, (unsigned long)cpump->page, + cpump->save); + free_page((unsigned long)cpump->page); + cpump->page = NULL; + kvfree(cpump->save); + cpump->save = NULL; + } + mutex_unlock(&pai_reserve_mutex); +} + +static u64 paicrypt_getctr(struct paicrypt_map *cpump, int nr, bool kernel) +{ + if (kernel) + nr += PAI_CRYPTO_MAXCTR; + return cpump->page[nr]; +} + +/* Read the counter values. Return value from location in CMP. For event + * CRYPTO_ALL sum up all events. + */ +static u64 paicrypt_getdata(struct perf_event *event, bool kernel) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + u64 sum = 0; + int i; + + if (event->attr.config != PAI_CRYPTO_BASE) { + return paicrypt_getctr(cpump, + event->attr.config - PAI_CRYPTO_BASE, + kernel); + } + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = paicrypt_getctr(cpump, i, kernel); + + if (!val) + continue; + sum += val; + } + return sum; +} + +static u64 paicrypt_getall(struct perf_event *event) +{ + u64 sum = 0; + + if (!event->attr.exclude_kernel) + sum += paicrypt_getdata(event, true); + if (!event->attr.exclude_user) + sum += paicrypt_getdata(event, false); + + return sum; +} + +/* Used to avoid races in checking concurrent access of counting and + * sampling for crypto events + * + * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is + * allowed and when this event is running, no counting event is allowed. + * Several counting events are allowed in parallel, but no sampling event + * is allowed while one (or more) counting events are running. + * + * This function is called in process context and it is save to block. + * When the event initialization functions fails, no other call back will + * be invoked. + * + * Allocate the memory for the event. + */ +static int paicrypt_busy(struct perf_event_attr *a, struct paicrypt_map *cpump) +{ + unsigned int *use_ptr; + int rc = 0; + + mutex_lock(&pai_reserve_mutex); + if (a->sample_period) { /* Sampling requested */ + use_ptr = &cpump->sampler; + if (cpump->counter || cpump->sampler) + rc = -EBUSY; /* ... sampling/counting active */ + } else { /* Counting requested */ + use_ptr = &cpump->counter; + if (cpump->sampler) + rc = -EBUSY; /* ... and sampling active */ + } + if (rc) + goto unlock; + + /* Allocate memory for counter page and counter extraction. + * Only the first counting event has to allocate a page. + */ + if (cpump->page) + goto unlock; + + rc = -ENOMEM; + cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (!cpump->page) + goto unlock; + cpump->save = kvmalloc_array(paicrypt_cnt + 1, + sizeof(struct pai_userdata), GFP_KERNEL); + if (!cpump->save) { + free_page((unsigned long)cpump->page); + cpump->page = NULL; + goto unlock; + } + rc = 0; + +unlock: + /* If rc is non-zero, do not increment counter/sampler. */ + if (!rc) + *use_ptr += 1; + debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx sampler %d" + " counter %d page %#lx save %p rc %d\n", __func__, + a->sample_period, cpump->sampler, cpump->counter, + (unsigned long)cpump->page, cpump->save, rc); + mutex_unlock(&pai_reserve_mutex); + return rc; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paicrypt_event_init(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + struct paicrypt_map *cpump; + int rc; + + /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* PAI crypto event must be valid */ + if (a->config > PAI_CRYPTO_BASE + paicrypt_cnt) + return -EINVAL; + /* Allow only CPU wide operation, no process context for now. */ + if (event->hw.target || event->cpu == -1) + return -ENOENT; + /* Allow only CRYPTO_ALL for sampling. */ + if (a->sample_period && a->config != PAI_CRYPTO_BASE) + return -EINVAL; + + cpump = per_cpu_ptr(&paicrypt_map, event->cpu); + rc = paicrypt_busy(a, cpump); + if (rc) + return rc; + + cpump->event = event; + event->destroy = paicrypt_event_destroy; + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which contain the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } + + static_branch_inc(&pai_key); + return 0; +} + +static void paicrypt_read(struct perf_event *event) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = paicrypt_getall(event); + local64_set(&event->hw.prev_count, new); + delta = (prev <= new) ? new - prev + : (-1ULL - prev) + new + 1; /* overflow */ + local64_add(delta, &event->count); +} + +static void paicrypt_start(struct perf_event *event, int flags) +{ + u64 sum; + + sum = paicrypt_getall(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + local64_set(&event->count, 0); +} + +static int paicrypt_add(struct perf_event *event, int flags) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + unsigned long ccd; + + if (cpump->users++ == 0) { + ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; + WRITE_ONCE(S390_lowcore.ccd, ccd); + __ctl_set_bit(0, 50); + } + cpump->event = event; + if (flags & PERF_EF_START && !event->attr.sample_period) { + /* Only counting needs initial counter value */ + paicrypt_start(event, PERF_EF_RELOAD); + } + event->hw.state = 0; + if (event->attr.sample_period) + perf_sched_cb_inc(event->pmu); + return 0; +} + +static void paicrypt_stop(struct perf_event *event, int flags) +{ + paicrypt_read(event); + event->hw.state = PERF_HES_STOPPED; +} + +static void paicrypt_del(struct perf_event *event, int flags) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + + if (event->attr.sample_period) + perf_sched_cb_dec(event->pmu); + if (!event->attr.sample_period) + /* Only counting needs to read counter */ + paicrypt_stop(event, PERF_EF_UPDATE); + if (cpump->users-- == 1) { + __ctl_clear_bit(0, 50); + WRITE_ONCE(S390_lowcore.ccd, 0); + } +} + +/* Create raw data and save it in buffer. Returns number of bytes copied. + * Saves only positive counter entries of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t paicrypt_copy(struct pai_userdata *userdata, + struct paicrypt_map *cpump, + bool exclude_user, bool exclude_kernel) +{ + int i, outidx = 0; + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = 0; + + if (!exclude_kernel) + val += paicrypt_getctr(cpump, i, true); + if (!exclude_user) + val += paicrypt_getctr(cpump, i, false); + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(struct pai_userdata); +} + +static int paicrypt_push_sample(void) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + struct perf_event *event = cpump->event; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + size_t rawsize; + int overflow; + + if (!cpump->event) /* No event active */ + return 0; + rawsize = paicrypt_copy(cpump->save, cpump, + cpump->event->attr.exclude_user, + cpump->event->attr.exclude_kernel); + if (!rawsize) /* No incremented counters */ + return 0; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) { + data.cpu_entry.cpu = smp_processor_id(); + data.cpu_entry.reserved = 0; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + raw.size = raw.frag.size; + data.raw = &raw; + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Clear lowcore page after read */ + memset(cpump->page, 0, PAGE_SIZE); + return overflow; +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event CRYPTO_ALL is allowed. + */ +static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, clear values. + */ + if (!sched_in) + paicrypt_push_sample(); +} + +/* Attribute definitions for paicrypt interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instructions returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1000 + offset in mapped kernel page. + * All CPU Measurement Facility counters identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paicrypt_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paicrypt_events_group = { + .name = "events", + .attrs = NULL /* Filled in attr_event_init() */ +}; + +static struct attribute_group paicrypt_format_group = { + .name = "format", + .attrs = paicrypt_format_attr, +}; + +static const struct attribute_group *paicrypt_attr_groups[] = { + &paicrypt_events_group, + &paicrypt_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paicrypt = { + .task_ctx_nr = perf_invalid_context, + .event_init = paicrypt_event_init, + .add = paicrypt_add, + .del = paicrypt_del, + .start = paicrypt_start, + .stop = paicrypt_stop, + .read = paicrypt_read, + .sched_task = paicrypt_sched_task, + .attr_groups = paicrypt_attr_groups +}; + +/* List of symbolic PAI counter names. */ +static const char * const paicrypt_ctrnames[] = { + [0] = "CRYPTO_ALL", + [1] = "KM_DEA", + [2] = "KM_TDEA_128", + [3] = "KM_TDEA_192", + [4] = "KM_ENCRYPTED_DEA", + [5] = "KM_ENCRYPTED_TDEA_128", + [6] = "KM_ENCRYPTED_TDEA_192", + [7] = "KM_AES_128", + [8] = "KM_AES_192", + [9] = "KM_AES_256", + [10] = "KM_ENCRYPTED_AES_128", + [11] = "KM_ENCRYPTED_AES_192", + [12] = "KM_ENCRYPTED_AES_256", + [13] = "KM_XTS_AES_128", + [14] = "KM_XTS_AES_256", + [15] = "KM_XTS_ENCRYPTED_AES_128", + [16] = "KM_XTS_ENCRYPTED_AES_256", + [17] = "KMC_DEA", + [18] = "KMC_TDEA_128", + [19] = "KMC_TDEA_192", + [20] = "KMC_ENCRYPTED_DEA", + [21] = "KMC_ENCRYPTED_TDEA_128", + [22] = "KMC_ENCRYPTED_TDEA_192", + [23] = "KMC_AES_128", + [24] = "KMC_AES_192", + [25] = "KMC_AES_256", + [26] = "KMC_ENCRYPTED_AES_128", + [27] = "KMC_ENCRYPTED_AES_192", + [28] = "KMC_ENCRYPTED_AES_256", + [29] = "KMC_PRNG", + [30] = "KMA_GCM_AES_128", + [31] = "KMA_GCM_AES_192", + [32] = "KMA_GCM_AES_256", + [33] = "KMA_GCM_ENCRYPTED_AES_128", + [34] = "KMA_GCM_ENCRYPTED_AES_192", + [35] = "KMA_GCM_ENCRYPTED_AES_256", + [36] = "KMF_DEA", + [37] = "KMF_TDEA_128", + [38] = "KMF_TDEA_192", + [39] = "KMF_ENCRYPTED_DEA", + [40] = "KMF_ENCRYPTED_TDEA_128", + [41] = "KMF_ENCRYPTED_TDEA_192", + [42] = "KMF_AES_128", + [43] = "KMF_AES_192", + [44] = "KMF_AES_256", + [45] = "KMF_ENCRYPTED_AES_128", + [46] = "KMF_ENCRYPTED_AES_192", + [47] = "KMF_ENCRYPTED_AES_256", + [48] = "KMCTR_DEA", + [49] = "KMCTR_TDEA_128", + [50] = "KMCTR_TDEA_192", + [51] = "KMCTR_ENCRYPTED_DEA", + [52] = "KMCTR_ENCRYPTED_TDEA_128", + [53] = "KMCTR_ENCRYPTED_TDEA_192", + [54] = "KMCTR_AES_128", + [55] = "KMCTR_AES_192", + [56] = "KMCTR_AES_256", + [57] = "KMCTR_ENCRYPTED_AES_128", + [58] = "KMCTR_ENCRYPTED_AES_192", + [59] = "KMCTR_ENCRYPTED_AES_256", + [60] = "KMO_DEA", + [61] = "KMO_TDEA_128", + [62] = "KMO_TDEA_192", + [63] = "KMO_ENCRYPTED_DEA", + [64] = "KMO_ENCRYPTED_TDEA_128", + [65] = "KMO_ENCRYPTED_TDEA_192", + [66] = "KMO_AES_128", + [67] = "KMO_AES_192", + [68] = "KMO_AES_256", + [69] = "KMO_ENCRYPTED_AES_128", + [70] = "KMO_ENCRYPTED_AES_192", + [71] = "KMO_ENCRYPTED_AES_256", + [72] = "KIMD_SHA_1", + [73] = "KIMD_SHA_256", + [74] = "KIMD_SHA_512", + [75] = "KIMD_SHA3_224", + [76] = "KIMD_SHA3_256", + [77] = "KIMD_SHA3_384", + [78] = "KIMD_SHA3_512", + [79] = "KIMD_SHAKE_128", + [80] = "KIMD_SHAKE_256", + [81] = "KIMD_GHASH", + [82] = "KLMD_SHA_1", + [83] = "KLMD_SHA_256", + [84] = "KLMD_SHA_512", + [85] = "KLMD_SHA3_224", + [86] = "KLMD_SHA3_256", + [87] = "KLMD_SHA3_384", + [88] = "KLMD_SHA3_512", + [89] = "KLMD_SHAKE_128", + [90] = "KLMD_SHAKE_256", + [91] = "KMAC_DEA", + [92] = "KMAC_TDEA_128", + [93] = "KMAC_TDEA_192", + [94] = "KMAC_ENCRYPTED_DEA", + [95] = "KMAC_ENCRYPTED_TDEA_128", + [96] = "KMAC_ENCRYPTED_TDEA_192", + [97] = "KMAC_AES_128", + [98] = "KMAC_AES_192", + [99] = "KMAC_AES_256", + [100] = "KMAC_ENCRYPTED_AES_128", + [101] = "KMAC_ENCRYPTED_AES_192", + [102] = "KMAC_ENCRYPTED_AES_256", + [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", + [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", + [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", + [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", + [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", + [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", + [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", + [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", + [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", + [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", + [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", + [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A", + [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", + [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", + [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", + [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", + [119] = "PCC_SCALAR_MULTIPLY_P256", + [120] = "PCC_SCALAR_MULTIPLY_P384", + [121] = "PCC_SCALAR_MULTIPLY_P521", + [122] = "PCC_SCALAR_MULTIPLY_ED25519", + [123] = "PCC_SCALAR_MULTIPLY_ED448", + [124] = "PCC_SCALAR_MULTIPLY_X25519", + [125] = "PCC_SCALAR_MULTIPLY_X448", + [126] = "PRNO_SHA_512_DRNG", + [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", + [128] = "PRNO_TRNG", + [129] = "KDSA_ECDSA_VERIFY_P256", + [130] = "KDSA_ECDSA_VERIFY_P384", + [131] = "KDSA_ECDSA_VERIFY_P521", + [132] = "KDSA_ECDSA_SIGN_P256", + [133] = "KDSA_ECDSA_SIGN_P384", + [134] = "KDSA_ECDSA_SIGN_P521", + [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", + [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", + [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", + [138] = "KDSA_EDDSA_VERIFY_ED25519", + [139] = "KDSA_EDDSA_VERIFY_ED448", + [140] = "KDSA_EDDSA_SIGN_ED25519", + [141] = "KDSA_EDDSA_SIGN_ED448", + [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", + [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", + [144] = "PCKMO_ENCRYPT_DEA_KEY", + [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", + [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", + [147] = "PCKMO_ENCRYPT_AES_128_KEY", + [148] = "PCKMO_ENCRYPT_AES_192_KEY", + [149] = "PCKMO_ENCRYPT_AES_256_KEY", + [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", + [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", + [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", + [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", + [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", + [155] = "IBM_RESERVED_155", + [156] = "IBM_RESERVED_156", +}; + +static void __init attr_event_free(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + int i; + + for (i = 0; i < num; i++) { + struct device_attribute *dap; + + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static int __init attr_event_init_one(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + sysfs_attr_init(&pa->attr.attr); + pa->id = PAI_CRYPTO_BASE + num; + pa->attr.attr.name = paicrypt_ctrnames[num]; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + attrs[num] = &pa->attr.attr; + return 0; +} + +/* Create PMU sysfs event attributes on the fly. */ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i - 1); + return ret; + } + } + attrs[i] = NULL; + paicrypt_events_group.attrs = attrs; + return 0; +} + +static int __init paicrypt_init(void) +{ + struct qpaci_info_block ib; + int rc; + + if (!test_facility(196)) + return 0; + + qpaci(&ib); + paicrypt_cnt = ib.num_cc; + if (paicrypt_cnt == 0) + return 0; + if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) + paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1; + + rc = attr_event_init(); /* Export known PAI crypto events */ + if (rc) { + pr_err("Creation of PMU pai_crypto /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!cfm_dbg) { + pr_err("Registration of s390dbf pai_crypto failed\n"); + return -ENOMEM; + } + debug_register_view(cfm_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paicrypt, "pai_crypto", -1); + if (rc) { + pr_err("Registering the pai_crypto PMU failed with rc=%i\n", + rc); + debug_unregister_view(cfm_dbg, &debug_sprintf_view); + debug_unregister(cfm_dbg); + return rc; + } + return 0; +} + +device_initcall(paicrypt_init); diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index 9438368c3632..a9a1a6f45375 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -14,6 +14,7 @@ * moves the new kernel to its destination... * %r2 = pointer to first kimage_entry_t * %r3 = start address - where to jump to after the job is done... + * %r4 = subcode * * %r5 will be used as temp. storage * %r6 holds the destination address @@ -56,7 +57,7 @@ ENTRY(relocate_kernel) jo 0b j .base .done: - sgr %r0,%r0 # clear register r0 + lgr %r0,%r4 # subcode cghi %r3,0 je .diag la %r4,load_psw-.base(%r13) # load psw-address into the register diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index d860ac300919..8d91eccc0963 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -494,7 +494,7 @@ static void __init setup_lowcore_dat_off(void) lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; - set_prefix((u32)(unsigned long) lc); + set_prefix(__pa(lc)); lowcore_ptr[0] = lc; } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 326cb8f75f58..6b7b6d5e3632 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -364,7 +364,7 @@ static inline int check_sync_clock(void) * Apply clock delta to the global data structures. * This is called once on the CPU that performed the clock sync. */ -static void clock_sync_global(unsigned long delta) +static void clock_sync_global(long delta) { unsigned long now, adj; struct ptff_qto qto; @@ -400,7 +400,7 @@ static void clock_sync_global(unsigned long delta) * Apply clock delta to the per-CPU data structures of this CPU. * This is called for each online CPU after the call to clock_sync_global. */ -static void clock_sync_local(unsigned long delta) +static void clock_sync_local(long delta) { /* Add the delta to the clock comparator. */ if (S390_lowcore.clock_comparator != clock_comparator_max) { @@ -424,7 +424,7 @@ static void __init time_init_wq(void) struct clock_sync_data { atomic_t cpus; int in_sync; - unsigned long clock_delta; + long clock_delta; }; /* @@ -544,7 +544,7 @@ static int stpinfo_valid(void) static int stp_sync_clock(void *data) { struct clock_sync_data *sync = data; - u64 clock_delta, flags; + long clock_delta, flags; static int first; int rc; @@ -554,9 +554,7 @@ static int stp_sync_clock(void *data) while (atomic_read(&sync->cpus) != 0) cpu_relax(); rc = 0; - if (stp_info.todoff[0] || stp_info.todoff[1] || - stp_info.todoff[2] || stp_info.todoff[3] || - stp_info.tmd != 2) { + if (stp_info.todoff || stp_info.tmd != 2) { flags = vdso_update_begin(); rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta); diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 99694260cac9..5075cde77b29 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/smp.h> #include <linux/time_namespace.h> +#include <linux/random.h> #include <vdso/datapage.h> #include <asm/vdso.h> @@ -160,10 +161,9 @@ int vdso_getcpu_init(void) } early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) { - unsigned long vdso_text_len, vdso_mapping_len; - unsigned long vvar_start, vdso_text_start; + unsigned long vvar_start, vdso_text_start, vdso_text_len; struct vm_special_mapping *vdso_mapping; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -180,8 +180,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vdso_text_len = vdso64_end - vdso64_start; vdso_mapping = &vdso64_mapping; } - vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; - vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); + vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0); rc = vvar_start; if (IS_ERR_VALUE(vvar_start)) goto out; @@ -210,6 +209,52 @@ out: return rc; } +static unsigned long vdso_addr(unsigned long start, unsigned long len) +{ + unsigned long addr, end, offset; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= VDSO_BASE) + end = VDSO_BASE; + end -= len; + + if (end > start) { + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + return addr; +} + +unsigned long vdso_size(void) +{ + unsigned long size = VVAR_NR_PAGES * PAGE_SIZE; + + if (is_compat_task()) + size += vdso32_end - vdso32_start; + else + size += vdso64_end - vdso64_start; + return PAGE_ALIGN(size); +} + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + unsigned long addr = VDSO_BASE; + unsigned long size = vdso_size(); + + if (current->flags & PF_RANDOMIZE) + addr = vdso_addr(current->mm->start_stack + PAGE_SIZE, size); + return map_vdso(addr, size); +} + static struct page ** __init vdso_setup_pages(void *start, void *end) { int pages = (end - start) >> PAGE_SHIFT; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index da3dabda1a12..76ad6408cb2c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2384,7 +2384,16 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) return -EINVAL; if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; - if (kvm_s390_pv_is_protected(kvm)) + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane usecase to do switching and a memop on two + * different CPUs at the same time. + */ + if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { if (access_key_invalid(mop->key)) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 5beb7a4a11b3..83bb5cf97282 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -11,7 +11,6 @@ #include <linux/kvm.h> #include <linux/gfp.h> #include <linux/errno.h> -#include <linux/compat.h> #include <linux/mm_types.h> #include <linux/pgtable.h> diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index 5e7ea8b111e8..04d4c6cf898e 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -75,7 +75,7 @@ static inline int arch_load_niai4(int *lock) int owner; asm_inline volatile( - ALTERNATIVE("", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */ " l %0,%1\n" : "=d" (owner) : "Q" (*lock) : "memory"); return owner; @@ -86,7 +86,7 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new) int expected = old; asm_inline volatile( - ALTERNATIVE("", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */ " cs %0,%3,%1\n" : "=d" (old), "=Q" (*lock) : "0" (old), "d" (new), "Q" (*lock) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index af03cacf34ec..1ac73917a8d3 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1183,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1190,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index e54f928503c5..d545f5c39f7e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -58,9 +58,9 @@ static inline unsigned long mmap_base(unsigned long rnd, /* * Top of mmap area (just below the process stack). - * Leave at least a ~32 MB hole. + * Leave at least a ~128 MB hole. */ - gap_min = 32 * 1024 * 1024UL; + gap_min = SZ_128M; gap_max = (STACK_TOP / 6) * 5; if (gap < gap_min) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index aede9a3ca3f7..af35052d06ed 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1809,7 +1809,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* * Three initial passes: * - 1/2: Determine clobbered registers - * - 3: Calculate program size and addrs arrray + * - 3: Calculate program size and addrs array */ for (pass = 1; pass <= 3; pass++) { if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index e563cb65c0c4..bc980fd313d5 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -799,7 +799,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) struct zpci_dev *zdev; int rc; - zpci_dbg(3, "add fid:%x, fh:%x, c:%d\n", fid, fh, state); + zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state); zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); if (!zdev) return ERR_PTR(-ENOMEM); diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 1057d7af4a55..375e0a5120bc 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -30,7 +30,7 @@ bool zpci_unique_uid; void update_uid_checking(bool new) { if (zpci_unique_uid != new) - zpci_dbg(1, "uid checking:%d\n", new); + zpci_dbg(3, "uid checking:%d\n", new); zpci_unique_uid = new; } diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 3408c0df3ebf..ca6bd98eec13 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -196,7 +196,7 @@ int __init zpci_debug_init(void) if (!pci_debug_err_id) return -EINVAL; debug_register_view(pci_debug_err_id, &debug_hex_ascii_view); - debug_set_level(pci_debug_err_id, 6); + debug_set_level(pci_debug_err_id, 3); debugfs_root = debugfs_create_dir("pci", NULL); return 0; diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index ea9db5cea64e..b9324ca2eb94 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -321,9 +321,6 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n", ccdf->fid, ccdf->fh, ccdf->pec); - zpci_err("avail CCDF:\n"); - zpci_err_hex(ccdf, sizeof(*ccdf)); - switch (ccdf->pec) { case 0x0301: /* Reserved|Standby -> Configured */ if (!zdev) { diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 1710d006ee93..1a822b7799f8 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -18,16 +18,40 @@ #define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */ -static inline void zpci_err_insn(u8 cc, u8 status, u64 req, u64 offset) +struct zpci_err_insn_data { + u8 insn; + u8 cc; + u8 status; + union { + struct { + u64 req; + u64 offset; + }; + struct { + u64 addr; + u64 len; + }; + }; +} __packed; + +static inline void zpci_err_insn_req(int lvl, u8 insn, u8 cc, u8 status, + u64 req, u64 offset) { - struct { - u64 req; - u64 offset; - u8 cc; - u8 status; - } __packed data = {req, offset, cc, status}; - - zpci_err_hex(&data, sizeof(data)); + struct zpci_err_insn_data data = { + .insn = insn, .cc = cc, .status = status, + .req = req, .offset = offset}; + + zpci_err_hex_level(lvl, &data, sizeof(data)); +} + +static inline void zpci_err_insn_addr(int lvl, u8 insn, u8 cc, u8 status, + u64 addr, u64 len) +{ + struct zpci_err_insn_data data = { + .insn = insn, .cc = cc, .status = status, + .addr = addr, .len = len}; + + zpci_err_hex_level(lvl, &data, sizeof(data)); } /* Modify PCI Function Controls */ @@ -47,16 +71,24 @@ static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status) u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status) { + bool retried = false; u8 cc; do { cc = __mpcifc(req, fib, status); - if (cc == 2) + if (cc == 2) { msleep(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 'M', cc, *status, req, 0); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, *status, req, 0); + zpci_err_insn_req(0, 'M', cc, *status, req, 0); + else if (retried) + zpci_err_insn_req(1, 'M', cc, *status, req, 0); return cc; } @@ -80,16 +112,24 @@ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) int zpci_refresh_trans(u64 fn, u64 addr, u64 range) { + bool retried = false; u8 cc, status; do { cc = __rpcit(fn, addr, range, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_addr(1, 'R', cc, status, addr, range); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, addr, range); + zpci_err_insn_addr(0, 'R', cc, status, addr, range); + else if (retried) + zpci_err_insn_addr(1, 'R', cc, status, addr, range); if (cc == 1 && (status == 4 || status == 16)) return -ENOMEM; @@ -144,17 +184,25 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) int __zpci_load(u64 *data, u64 req, u64 offset) { + bool retried = false; u8 status; int cc; do { cc = __pcilg(data, req, offset, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 'l', cc, status, req, offset); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, offset); + zpci_err_insn_req(0, 'l', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 'l', cc, status, req, offset); return (cc > 0) ? -EIO : cc; } @@ -198,7 +246,7 @@ int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) cc = __pcilg_mio(data, (__force u64) addr, len, &status); if (cc) - zpci_err_insn(cc, status, 0, (__force u64) addr); + zpci_err_insn_addr(0, 'L', cc, status, (__force u64) addr, len); return (cc > 0) ? -EIO : cc; } @@ -225,17 +273,25 @@ static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) int __zpci_store(u64 data, u64 req, u64 offset) { + bool retried = false; u8 status; int cc; do { cc = __pcistg(data, req, offset, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 's', cc, status, req, offset); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, offset); + zpci_err_insn_req(0, 's', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 's', cc, status, req, offset); return (cc > 0) ? -EIO : cc; } @@ -278,7 +334,7 @@ int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) cc = __pcistg_mio(data, (__force u64) addr, len, &status); if (cc) - zpci_err_insn(cc, status, 0, (__force u64) addr); + zpci_err_insn_addr(0, 'S', cc, status, (__force u64) addr, len); return (cc > 0) ? -EIO : cc; } @@ -304,17 +360,25 @@ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) int __zpci_store_block(const u64 *data, u64 req, u64 offset) { + bool retried = false; u8 status; int cc; do { cc = __pcistb(data, req, offset, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(0, 'b', cc, status, req, offset); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, offset); + zpci_err_insn_req(0, 'b', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 'b', cc, status, req, offset); return (cc > 0) ? -EIO : cc; } @@ -358,7 +422,7 @@ int zpci_write_block(volatile void __iomem *dst, cc = __pcistb_mio(src, (__force u64) dst, len, &status); if (cc) - zpci_err_insn(cc, status, 0, (__force u64) dst); + zpci_err_insn_addr(0, 'B', cc, status, (__force u64) dst, len); return (cc > 0) ? -EIO : cc; } diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S index 3d1c31e0cf3d..6f835124ee82 100644 --- a/arch/s390/purgatory/head.S +++ b/arch/s390/purgatory/head.S @@ -44,11 +44,14 @@ .endm .macro MEMSWAP dst,src,buf,len -10: cghi \len,bufsz +10: larl %r0,purgatory_end + larl %r1,stack + slgr %r0,%r1 + cgr \len,%r0 jh 11f lgr %r4,\len j 12f -11: lghi %r4,bufsz +11: lgr %r4,%r0 12: MEMCPY \buf,\dst,%r4 MEMCPY \dst,\src,%r4 @@ -135,12 +138,18 @@ ENTRY(purgatory_start) .start_crash_kernel: /* Location of purgatory_start in crash memory */ + larl %r0,.base_crash + larl %r1,purgatory_start + slgr %r0,%r1 lgr %r8,%r13 - aghi %r8,-(.base_crash-purgatory_start) + sgr %r8,%r0 /* Destination for this code i.e. end of memory to be swapped. */ + larl %r0,purgatory_end + larl %r1,purgatory_start + slgr %r0,%r1 lg %r9,crash_size-.base_crash(%r13) - aghi %r9,-(purgatory_end-purgatory_start) + sgr %r9,%r0 /* Destination in crash memory, i.e. same as r9 but in crash memory. */ lg %r10,crash_start-.base_crash(%r13) @@ -149,15 +158,19 @@ ENTRY(purgatory_start) /* Buffer location (in crash memory) and size. As the purgatory is * behind the point of no return it can re-use the stack as buffer. */ - lghi %r11,bufsz + larl %r11,purgatory_end larl %r12,stack + slgr %r11,%r12 MEMCPY %r12,%r9,%r11 /* dst -> (crash) buf */ MEMCPY %r9,%r8,%r11 /* self -> dst */ /* Jump to new location. */ lgr %r7,%r9 - aghi %r7,.jump_to_dst-purgatory_start + larl %r0,.jump_to_dst + larl %r1,purgatory_start + slgr %r0,%r1 + agr %r7,%r0 br %r7 .jump_to_dst: @@ -169,7 +182,10 @@ ENTRY(purgatory_start) /* Load new buffer location after jump */ larl %r7,stack - aghi %r10,stack-purgatory_start + lgr %r0,%r7 + larl %r1,purgatory_start + slgr %r0,%r1 + agr %r10,%r0 MEMCPY %r10,%r7,%r11 /* (new) buf -> (crash) buf */ /* Now the code is set up to run from its designated location. Start diff --git a/arch/sh/boards/board-sh7757lcr.c b/arch/sh/boards/board-sh7757lcr.c index c32b4c6229d3..f39c8196efdf 100644 --- a/arch/sh/boards/board-sh7757lcr.c +++ b/arch/sh/boards/board-sh7757lcr.c @@ -16,7 +16,7 @@ #include <linux/io.h> #include <linux/mfd/tmio.h> #include <linux/mmc/host.h> -#include <linux/mmc/sh_mmcif.h> +#include <linux/platform_data/sh_mmcif.h> #include <linux/sh_eth.h> #include <linux/sh_intc.h> #include <linux/usb/renesas_usbhs.h> diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 4c9522dd351f..674da7ebd8b7 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -19,7 +19,7 @@ #include <linux/memblock.h> #include <linux/mfd/tmio.h> #include <linux/mmc/host.h> -#include <linux/mmc/sh_mmcif.h> +#include <linux/platform_data/sh_mmcif.h> #include <linux/mtd/physmap.h> #include <linux/gpio.h> #include <linux/gpio/machine.h> diff --git a/arch/sh/boot/romimage/mmcif-sh7724.c b/arch/sh/boot/romimage/mmcif-sh7724.c index 6595b6b45bf1..d30123d859e0 100644 --- a/arch/sh/boot/romimage/mmcif-sh7724.c +++ b/arch/sh/boot/romimage/mmcif-sh7724.c @@ -8,7 +8,7 @@ * for more details. */ -#include <linux/mmc/sh_mmcif.h> +#include <linux/platform_data/sh_mmcif.h> #include <mach/romimage.h> #define MMCIF_BASE (void __iomem *)0xa4ca0000 diff --git a/arch/sh/configs/rsk7201_defconfig b/arch/sh/configs/rsk7201_defconfig index e41526120be1..619c18699459 100644 --- a/arch/sh/configs/rsk7201_defconfig +++ b/arch/sh/configs/rsk7201_defconfig @@ -25,7 +25,6 @@ CONFIG_CMDLINE_OVERWRITE=y CONFIG_CMDLINE="console=ttySC0,115200 earlyprintk=serial ignore_loglevel" CONFIG_BINFMT_FLAT=y CONFIG_BINFMT_ZFLAT=y -CONFIG_BINFMT_SHARED_FLAT=y CONFIG_PM=y CONFIG_CPU_IDLE=y # CONFIG_STANDALONE is not set diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig index 6af08fa1ddf8..5a54e2b883f0 100644 --- a/arch/sh/configs/rsk7203_defconfig +++ b/arch/sh/configs/rsk7203_defconfig @@ -30,7 +30,6 @@ CONFIG_CMDLINE_OVERWRITE=y CONFIG_CMDLINE="console=ttySC0,115200 earlyprintk=serial ignore_loglevel" CONFIG_BINFMT_FLAT=y CONFIG_BINFMT_ZFLAT=y -CONFIG_BINFMT_SHARED_FLAT=y CONFIG_PM=y CONFIG_CPU_IDLE=y CONFIG_NET=y diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index 601d062250d1..122216123e63 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -40,7 +40,6 @@ CONFIG_CMDLINE_OVERWRITE=y CONFIG_CMDLINE="console=ttySC3,115200 ignore_loglevel earlyprintk=serial" CONFIG_BINFMT_FLAT=y CONFIG_BINFMT_ZFLAT=y -CONFIG_BINFMT_SHARED_FLAT=y CONFIG_BINFMT_MISC=y CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/sparc/include/asm/timex_32.h b/arch/sparc/include/asm/timex_32.h index 542915b46209..f86326a6f89e 100644 --- a/arch/sparc/include/asm/timex_32.h +++ b/arch/sparc/include/asm/timex_32.h @@ -9,8 +9,6 @@ #define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ -/* XXX Maybe do something better at some point... -DaveM */ -typedef unsigned long cycles_t; -#define get_cycles() (0) +#include <asm-generic/timex.h> #endif diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 666f81e617ea..2fda57a3ea86 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -128,6 +128,7 @@ #define SO_TXREHASH 0x0053 +#define SO_RCVMARK 0x0054 #if !defined(__KERNEL__) diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index f9fe502b81c6..dad38960d1a8 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -779,5 +779,6 @@ static_assert(offsetof(compat_siginfo_t, si_upper) == 0x18); static_assert(offsetof(compat_siginfo_t, si_pkey) == 0x14); static_assert(offsetof(compat_siginfo_t, si_perf_data) == 0x10); static_assert(offsetof(compat_siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_flags) == 0x18); static_assert(offsetof(compat_siginfo_t, si_band) == 0x0c); static_assert(offsetof(compat_siginfo_t, si_fd) == 0x10); diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 8b9fc76cd3e0..570e43e6fda5 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -590,5 +590,6 @@ static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x14); diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index c5e1545bc5cf..77d7b9032158 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -58,7 +58,7 @@ CFL := $(PROFILING) -mcmodel=medlow -fPIC -O2 -fasynchronous-unwind-tables -m64 SPARC_REG_CFLAGS = -ffixed-g4 -ffixed-g5 -fcall-used-g5 -fcall-used-g7 -$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(SPARC_REG_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(SPARC_REG_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. @@ -88,6 +88,7 @@ $(obj)/vdso32.so.dbg: asflags-$(CONFIG_SPARC64) += -m32 KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mcmodel=medlow,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(SPARC_REG_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -fpic diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index b03269faef71..c4344b67628d 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -483,7 +483,6 @@ static void ubd_handler(void) if ((io_req->error == BLK_STS_NOTSUPP) && (req_op(io_req->req) == REQ_OP_DISCARD)) { blk_queue_max_discard_sectors(io_req->req->q, 0); blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, io_req->req->q); } blk_mq_end_request(io_req->req, io_req->error); kfree(io_req); @@ -800,10 +799,8 @@ static int ubd_open_dev(struct ubd *ubd_dev) } if (ubd_dev->no_trim == 0) { ubd_dev->queue->limits.discard_granularity = SECTOR_SIZE; - ubd_dev->queue->limits.discard_alignment = SECTOR_SIZE; blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST); blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, ubd_dev->queue); } blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue); return 0; diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 1d6f6a66766c..548265312743 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1255,7 +1255,8 @@ static int vector_net_open(struct net_device *dev) goto out_close; } - netif_napi_add(vp->dev, &vp->napi, vector_poll, get_depth(vp->parsed)); + netif_napi_add_weight(vp->dev, &vp->napi, vector_poll, + get_depth(vp->parsed)); napi_enable(&vp->napi); /* READ IRQ */ diff --git a/arch/um/include/asm/timex.h b/arch/um/include/asm/timex.h index e392a9a5bc9b..9f27176adb26 100644 --- a/arch/um/include/asm/timex.h +++ b/arch/um/include/asm/timex.h @@ -2,13 +2,8 @@ #ifndef __UM_TIMEX_H #define __UM_TIMEX_H -typedef unsigned long cycles_t; - -static inline cycles_t get_cycles (void) -{ - return 0; -} - #define CLOCK_TICK_RATE (HZ) +#include <asm-generic/timex.h> + #endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 675416deb298..6f59517c4356 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -188,7 +188,7 @@ config X86 select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT - select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION + select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS @@ -212,6 +212,7 @@ config X86 select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_JUMP_LABEL_HACK if HAVE_OBJTOOL select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 @@ -230,7 +231,10 @@ config X86 select HAVE_MOD_ARCH_SPECIFIC select HAVE_MOVE_PMD select HAVE_MOVE_PUD + select HAVE_NOINSTR_HACK if HAVE_OBJTOOL select HAVE_NMI + select HAVE_NOINSTR_VALIDATION if HAVE_OBJTOOL + select HAVE_OBJTOOL if X86_64 select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS @@ -239,17 +243,17 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR - select HAVE_STACK_VALIDATION if X86_64 + select HAVE_STACK_VALIDATION if HAVE_OBJTOOL select HAVE_STATIC_CALL - select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION + select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL select HAVE_PREEMPT_DYNAMIC_CALL select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS @@ -268,7 +272,6 @@ config X86 select RTC_MC146818_LIB select SPARSE_IRQ select SRCU - select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE) select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT @@ -459,6 +462,7 @@ config GOLDFISH config RETPOLINE bool "Avoid speculative indirect branches in kernel" + select OBJTOOL if HAVE_OBJTOOL default y help Compile kernel with the retpoline compiler options to guard against @@ -472,6 +476,7 @@ config CC_HAS_SLS config SLS bool "Mitigate Straight-Line-Speculation" depends on CC_HAS_SLS && X86_64 + select OBJTOOL if HAVE_OBJTOOL default n help Compile the kernel with straight-line-speculation options to guard @@ -878,6 +883,21 @@ config ACRN_GUEST IOT with small footprint and real-time features. More details can be found in https://projectacrn.org/. +config INTEL_TDX_GUEST + bool "Intel TDX (Trust Domain Extensions) - Guest Support" + depends on X86_64 && CPU_SUP_INTEL + depends on X86_X2APIC + select ARCH_HAS_CC_PLATFORM + select X86_MEM_ENCRYPT + select X86_MCE + help + Support running as a guest under Intel TDX. Without this support, + the guest kernel can not boot or run under TDX. + TDX includes memory encryption and integrity capabilities + which protect the confidentiality and integrity of guest + memory contents and CPU state. TDX guests are protected from + some attacks from the VMM. + endif #HYPERVISOR_GUEST source "arch/x86/Kconfig.cpu" @@ -1319,7 +1339,7 @@ config MICROCODE config MICROCODE_INTEL bool "Intel microcode loading support" - depends on MICROCODE + depends on CPU_SUP_INTEL && MICROCODE default MICROCODE help This options enables microcode patch loading support for Intel @@ -1331,7 +1351,7 @@ config MICROCODE_INTEL config MICROCODE_AMD bool "AMD microcode loading support" - depends on MICROCODE + depends on CPU_SUP_AMD && MICROCODE help If you select this option, microcode patch loading support for AMD processors will be enabled. @@ -1822,17 +1842,6 @@ config ARCH_RANDOM If supported, this is a high bandwidth, cryptographically secure hardware random number generator. -config X86_SMAP - def_bool y - prompt "Supervisor Mode Access Prevention" if EXPERT - help - Supervisor Mode Access Prevention (SMAP) is a security - feature in newer Intel processors. There is a small - performance cost if this enabled and turned on; there is - also a small increase in the kernel size if this is enabled. - - If unsure, say Y. - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT @@ -1861,9 +1870,10 @@ config CC_HAS_IBT config X86_KERNEL_IBT prompt "Indirect Branch Tracking" bool - depends on X86_64 && CC_HAS_IBT && STACK_VALIDATION + depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f depends on !LD_IS_LLD || LLD_VERSION >= 140000 + select OBJTOOL help Build the kernel with support for Indirect Branch Tracking, a hardware support course-grain forward-edge Control Flow Integrity @@ -2332,7 +2342,9 @@ choice it can be used to assist security vulnerability exploitation. This setting can be changed at boot time via the kernel command - line parameter vsyscall=[emulate|xonly|none]. + line parameter vsyscall=[emulate|xonly|none]. Emulate mode + is deprecated and can only be enabled using the kernel command + line. On a system with recent enough glibc (2.14 or newer) and no static binaries, you can say None without a performance penalty @@ -2340,20 +2352,6 @@ choice If unsure, select "Emulate execution only". - config LEGACY_VSYSCALL_EMULATE - bool "Full emulation" - help - The kernel traps and emulates calls into the fixed vsyscall - address mapping. This makes the mapping non-executable, but - it still contains readable known contents, which could be - used in certain rare security vulnerability exploits. This - configuration is recommended when using legacy userspace - that still uses vsyscalls along with legacy binary - instrumentation tools that require code to be readable. - - An example of this type of legacy userspace is running - Pin on an old binary that still uses vsyscalls. - config LEGACY_VSYSCALL_XONLY bool "Emulate execution only" help @@ -2844,13 +2842,6 @@ config IA32_EMULATION 64-bit kernel. You should likely turn this on, unless you're 100% sure that you don't have any 32-bit programs left. -config IA32_AOUT - tristate "IA32 a.out support" - depends on IA32_EMULATION - depends on BROKEN - help - Support old a.out binaries in the 32bit emulation. - config X86_X32_ABI bool "x32 ABI for 64-bit mode" depends on X86_64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d3a6f74a94bd..d872a7522e55 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -237,7 +237,7 @@ choice config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 - select STACK_VALIDATION + select OBJTOOL help This option enables the ORC (Oops Rewind Capability) unwinder for unwinding kernel stack traces. It uses a custom data format which is diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 63d50f65b828..1abd7cc9d6cd 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -313,5 +313,6 @@ define archhelp echo '' echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest' echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest' + echo ' x86_debug.config - Enable tip tree debugging options for testing' endef diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 34c9dbb6a47d..148ba5c5106e 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -26,6 +26,7 @@ #include "bitops.h" #include "ctype.h" #include "cpuflags.h" +#include "io.h" /* Useful macros */ #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) @@ -35,44 +36,10 @@ extern struct boot_params boot_params; #define cpu_relax() asm volatile("rep; nop") -/* Basic port I/O */ -static inline void outb(u8 v, u16 port) -{ - asm volatile("outb %0,%1" : : "a" (v), "dN" (port)); -} -static inline u8 inb(u16 port) -{ - u8 v; - asm volatile("inb %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - -static inline void outw(u16 v, u16 port) -{ - asm volatile("outw %0,%1" : : "a" (v), "dN" (port)); -} -static inline u16 inw(u16 port) -{ - u16 v; - asm volatile("inw %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - -static inline void outl(u32 v, u16 port) -{ - asm volatile("outl %0,%1" : : "a" (v), "dN" (port)); -} -static inline u32 inl(u16 port) -{ - u32 v; - asm volatile("inl %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - static inline void io_delay(void) { const u16 DELAY_PORT = 0x80; - asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT)); + outb(0, DELAY_PORT); } /* These functions are used to reference data in other segments. */ @@ -110,66 +77,78 @@ typedef unsigned int addr_t; static inline u8 rdfs8(addr_t addr) { + u8 *ptr = (u8 *)absolute_pointer(addr); u8 v; - asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*ptr)); return v; } static inline u16 rdfs16(addr_t addr) { + u16 *ptr = (u16 *)absolute_pointer(addr); u16 v; - asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline u32 rdfs32(addr_t addr) { + u32 *ptr = (u32 *)absolute_pointer(addr); u32 v; - asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline void wrfs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); + u8 *ptr = (u8 *)absolute_pointer(addr); + asm volatile("movb %1,%%fs:%0" : "+m" (*ptr) : "qi" (v)); } static inline void wrfs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); + u16 *ptr = (u16 *)absolute_pointer(addr); + asm volatile("movw %1,%%fs:%0" : "+m" (*ptr) : "ri" (v)); } static inline void wrfs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); + u32 *ptr = (u32 *)absolute_pointer(addr); + asm volatile("movl %1,%%fs:%0" : "+m" (*ptr) : "ri" (v)); } static inline u8 rdgs8(addr_t addr) { + u8 *ptr = (u8 *)absolute_pointer(addr); u8 v; - asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*ptr)); return v; } static inline u16 rdgs16(addr_t addr) { + u16 *ptr = (u16 *)absolute_pointer(addr); u16 v; - asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline u32 rdgs32(addr_t addr) { + u32 *ptr = (u32 *)absolute_pointer(addr); u32 v; - asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline void wrgs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); + u8 *ptr = (u8 *)absolute_pointer(addr); + asm volatile("movb %1,%%gs:%0" : "+m" (*ptr) : "qi" (v)); } static inline void wrgs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); + u16 *ptr = (u16 *)absolute_pointer(addr); + asm volatile("movw %1,%%gs:%0" : "+m" (*ptr) : "ri" (v)); } static inline void wrgs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); + u32 *ptr = (u32 *)absolute_pointer(addr); + asm volatile("movl %1,%%gs:%0" : "+m" (*ptr) : "ri" (v)); } /* Note: these only return true/false, not a signed return value! */ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6115274fe10f..19e1905dcbf6 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -101,8 +101,10 @@ ifdef CONFIG_X86_64 endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o +vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a $(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 8bcbcee54aa1..9caf89063e77 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -3,10 +3,9 @@ #include "misc.h" #include "error.h" #include "../string.h" +#include "efi.h" #include <linux/numa.h> -#include <linux/efi.h> -#include <asm/efi.h> /* * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0' @@ -20,153 +19,56 @@ */ struct mem_vector immovable_mem[MAX_NUMNODES*2]; -/* - * Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and - * ACPI_TABLE_GUID are found, take the former, which has more features. - */ static acpi_physical_address -__efi_get_rsdp_addr(unsigned long config_tables, unsigned int nr_tables, - bool efi_64) +__efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len) { - acpi_physical_address rsdp_addr = 0; - #ifdef CONFIG_EFI - int i; - - /* Get EFI tables from systab. */ - for (i = 0; i < nr_tables; i++) { - acpi_physical_address table; - efi_guid_t guid; - - if (efi_64) { - efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables + i; - - guid = tbl->guid; - table = tbl->table; - - if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) { - debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n"); - return 0; - } - } else { - efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables + i; - - guid = tbl->guid; - table = tbl->table; - } + unsigned long rsdp_addr; + int ret; - if (!(efi_guidcmp(guid, ACPI_TABLE_GUID))) - rsdp_addr = table; - else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID))) - return table; - } + /* + * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to + * ACPI_TABLE_GUID because it has more features. + */ + rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + ACPI_20_TABLE_GUID); + if (rsdp_addr) + return (acpi_physical_address)rsdp_addr; + + /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */ + rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + ACPI_TABLE_GUID); + if (rsdp_addr) + return (acpi_physical_address)rsdp_addr; + + debug_putstr("Error getting RSDP address.\n"); #endif - return rsdp_addr; -} - -/* EFI/kexec support is 64-bit only. */ -#ifdef CONFIG_X86_64 -static struct efi_setup_data *get_kexec_setup_data_addr(void) -{ - struct setup_data *data; - u64 pa_data; - - pa_data = boot_params->hdr.setup_data; - while (pa_data) { - data = (struct setup_data *)pa_data; - if (data->type == SETUP_EFI) - return (struct efi_setup_data *)(pa_data + sizeof(struct setup_data)); - - pa_data = data->next; - } - return NULL; -} - -static acpi_physical_address kexec_get_rsdp_addr(void) -{ - efi_system_table_64_t *systab; - struct efi_setup_data *esd; - struct efi_info *ei; - char *sig; - - esd = (struct efi_setup_data *)get_kexec_setup_data_addr(); - if (!esd) - return 0; - - if (!esd->tables) { - debug_putstr("Wrong kexec SETUP_EFI data.\n"); - return 0; - } - - ei = &boot_params->efi_info; - sig = (char *)&ei->efi_loader_signature; - if (strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { - debug_putstr("Wrong kexec EFI loader signature.\n"); - return 0; - } - - /* Get systab from boot params. */ - systab = (efi_system_table_64_t *) (ei->efi_systab | ((__u64)ei->efi_systab_hi << 32)); - if (!systab) - error("EFI system table not found in kexec boot_params."); - - return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables, true); + return 0; } -#else -static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; } -#endif /* CONFIG_X86_64 */ static acpi_physical_address efi_get_rsdp_addr(void) { #ifdef CONFIG_EFI - unsigned long systab, config_tables; + unsigned long cfg_tbl_pa = 0; + unsigned int cfg_tbl_len; + unsigned long systab_pa; unsigned int nr_tables; - struct efi_info *ei; - bool efi_64; - char *sig; - - ei = &boot_params->efi_info; - sig = (char *)&ei->efi_loader_signature; - - if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { - efi_64 = true; - } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) { - efi_64 = false; - } else { - debug_putstr("Wrong EFI loader signature.\n"); - return 0; - } + enum efi_type et; + int ret; - /* Get systab from boot params. */ -#ifdef CONFIG_X86_64 - systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32); -#else - if (ei->efi_systab_hi || ei->efi_memmap_hi) { - debug_putstr("Error getting RSDP address: EFI system table located above 4GB.\n"); + et = efi_get_type(boot_params); + if (et == EFI_TYPE_NONE) return 0; - } - systab = ei->efi_systab; -#endif - if (!systab) - error("EFI system table not found."); - /* Handle EFI bitness properly */ - if (efi_64) { - efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab; + systab_pa = efi_get_system_table(boot_params); + if (!systab_pa) + error("EFI support advertised, but unable to locate system table."); - config_tables = stbl->tables; - nr_tables = stbl->nr_tables; - } else { - efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab; + ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len); + if (ret || !cfg_tbl_pa) + error("EFI config table not found."); - config_tables = stbl->tables; - nr_tables = stbl->nr_tables; - } - - if (!config_tables) - error("EFI config tables not found."); - - return __efi_get_rsdp_addr(config_tables, nr_tables, efi_64); + return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len); #else return 0; #endif @@ -256,14 +158,6 @@ acpi_physical_address get_rsdp_addr(void) pa = boot_params->acpi_rsdp_addr; - /* - * Try to get EFI data from setup_data. This can happen when we're a - * kexec'ed kernel and kexec(1) has passed all the required EFI info to - * us. - */ - if (!pa) - pa = kexec_get_rsdp_addr(); - if (!pa) pa = efi_get_rsdp_addr(); diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c index 261e81fb9582..70a8d1706d0f 100644 --- a/arch/x86/boot/compressed/early_serial_console.c +++ b/arch/x86/boot/compressed/early_serial_console.c @@ -1,5 +1,6 @@ #include "misc.h" -int early_serial_base; +/* This might be accessed before .bss is cleared, so use .data instead. */ +int early_serial_base __section(".data"); #include "../early_serial_console.c" diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c new file mode 100644 index 000000000000..6edd034b0b30 --- /dev/null +++ b/arch/x86/boot/compressed/efi.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for early access to EFI configuration table. + * + * Originally derived from arch/x86/boot/compressed/acpi.c + */ + +#include "misc.h" + +/** + * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment. + * + * @bp: pointer to boot_params + * + * Return: EFI_TYPE_{32,64} for valid EFI environments, EFI_TYPE_NONE otherwise. + */ +enum efi_type efi_get_type(struct boot_params *bp) +{ + struct efi_info *ei; + enum efi_type et; + const char *sig; + + ei = &bp->efi_info; + sig = (char *)&ei->efi_loader_signature; + + if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { + et = EFI_TYPE_64; + } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) { + et = EFI_TYPE_32; + } else { + debug_putstr("No EFI environment detected.\n"); + et = EFI_TYPE_NONE; + } + +#ifndef CONFIG_X86_64 + /* + * Existing callers like acpi.c treat this case as an indicator to + * fall-through to non-EFI, rather than an error, so maintain that + * functionality here as well. + */ + if (ei->efi_systab_hi || ei->efi_memmap_hi) { + debug_putstr("EFI system table is located above 4GB and cannot be accessed.\n"); + et = EFI_TYPE_NONE; + } +#endif + + return et; +} + +/** + * efi_get_system_table - Given a pointer to boot_params, retrieve the physical address + * of the EFI system table. + * + * @bp: pointer to boot_params + * + * Return: EFI system table address on success. On error, return 0. + */ +unsigned long efi_get_system_table(struct boot_params *bp) +{ + unsigned long sys_tbl_pa; + struct efi_info *ei; + enum efi_type et; + + /* Get systab from boot params. */ + ei = &bp->efi_info; +#ifdef CONFIG_X86_64 + sys_tbl_pa = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32); +#else + sys_tbl_pa = ei->efi_systab; +#endif + if (!sys_tbl_pa) { + debug_putstr("EFI system table not found."); + return 0; + } + + return sys_tbl_pa; +} + +/* + * EFI config table address changes to virtual address after boot, which may + * not be accessible for the kexec'd kernel. To address this, kexec provides + * the initial physical address via a struct setup_data entry, which is + * checked for here, along with some sanity checks. + */ +static struct efi_setup_data *get_kexec_setup_data(struct boot_params *bp, + enum efi_type et) +{ +#ifdef CONFIG_X86_64 + struct efi_setup_data *esd = NULL; + struct setup_data *data; + u64 pa_data; + + pa_data = bp->hdr.setup_data; + while (pa_data) { + data = (struct setup_data *)pa_data; + if (data->type == SETUP_EFI) { + esd = (struct efi_setup_data *)(pa_data + sizeof(struct setup_data)); + break; + } + + pa_data = data->next; + } + + /* + * Original ACPI code falls back to attempting normal EFI boot in these + * cases, so maintain existing behavior by indicating non-kexec + * environment to the caller, but print them for debugging. + */ + if (esd && !esd->tables) { + debug_putstr("kexec EFI environment missing valid configuration table.\n"); + return NULL; + } + + return esd; +#endif + return NULL; +} + +/** + * efi_get_conf_table - Given a pointer to boot_params, locate and return the physical + * address of EFI configuration table. + * + * @bp: pointer to boot_params + * @cfg_tbl_pa: location to store physical address of config table + * @cfg_tbl_len: location to store number of config table entries + * + * Return: 0 on success. On error, return params are left unchanged. + */ +int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len) +{ + unsigned long sys_tbl_pa; + enum efi_type et; + int ret; + + if (!cfg_tbl_pa || !cfg_tbl_len) + return -EINVAL; + + sys_tbl_pa = efi_get_system_table(bp); + if (!sys_tbl_pa) + return -EINVAL; + + /* Handle EFI bitness properly */ + et = efi_get_type(bp); + if (et == EFI_TYPE_64) { + efi_system_table_64_t *stbl = (efi_system_table_64_t *)sys_tbl_pa; + struct efi_setup_data *esd; + + /* kexec provides an alternative EFI conf table, check for it. */ + esd = get_kexec_setup_data(bp, et); + + *cfg_tbl_pa = esd ? esd->tables : stbl->tables; + *cfg_tbl_len = stbl->nr_tables; + } else if (et == EFI_TYPE_32) { + efi_system_table_32_t *stbl = (efi_system_table_32_t *)sys_tbl_pa; + + *cfg_tbl_pa = stbl->tables; + *cfg_tbl_len = stbl->nr_tables; + } else { + return -EINVAL; + } + + return 0; +} + +/* Get vendor table address/guid from EFI config table at the given index */ +static int get_vendor_table(void *cfg_tbl, unsigned int idx, + unsigned long *vendor_tbl_pa, + efi_guid_t *vendor_tbl_guid, + enum efi_type et) +{ + if (et == EFI_TYPE_64) { + efi_config_table_64_t *tbl_entry = (efi_config_table_64_t *)cfg_tbl + idx; + + if (!IS_ENABLED(CONFIG_X86_64) && tbl_entry->table >> 32) { + debug_putstr("Error: EFI config table entry located above 4GB.\n"); + return -EINVAL; + } + + *vendor_tbl_pa = tbl_entry->table; + *vendor_tbl_guid = tbl_entry->guid; + + } else if (et == EFI_TYPE_32) { + efi_config_table_32_t *tbl_entry = (efi_config_table_32_t *)cfg_tbl + idx; + + *vendor_tbl_pa = tbl_entry->table; + *vendor_tbl_guid = tbl_entry->guid; + } else { + return -EINVAL; + } + + return 0; +} + +/** + * efi_find_vendor_table - Given EFI config table, search it for the physical + * address of the vendor table associated with GUID. + * + * @bp: pointer to boot_params + * @cfg_tbl_pa: pointer to EFI configuration table + * @cfg_tbl_len: number of entries in EFI configuration table + * @guid: GUID of vendor table + * + * Return: vendor table address on success. On error, return 0. + */ +unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid) +{ + enum efi_type et; + unsigned int i; + + et = efi_get_type(bp); + if (et == EFI_TYPE_NONE) + return 0; + + for (i = 0; i < cfg_tbl_len; i++) { + unsigned long vendor_tbl_pa; + efi_guid_t vendor_tbl_guid; + int ret; + + ret = get_vendor_table((void *)cfg_tbl_pa, i, + &vendor_tbl_pa, + &vendor_tbl_guid, et); + if (ret) + return 0; + + if (!efi_guidcmp(guid, vendor_tbl_guid)) + return vendor_tbl_pa; + } + + return 0; +} diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h new file mode 100644 index 000000000000..7db2f41b54cd --- /dev/null +++ b/arch/x86/boot/compressed/efi.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_EFI_H +#define BOOT_COMPRESSED_EFI_H + +#if defined(_LINUX_EFI_H) || defined(_ASM_X86_EFI_H) +#error Please do not include kernel proper namespace headers +#endif + +typedef guid_t efi_guid_t __aligned(__alignof__(u32)); + +#define EFI_GUID(a, b, c, d...) (efi_guid_t){ { \ + (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, d } } + +#define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) +#define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81) +#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) + +#define EFI32_LOADER_SIGNATURE "EL32" +#define EFI64_LOADER_SIGNATURE "EL64" + +/* + * Generic EFI table header + */ +typedef struct { + u64 signature; + u32 revision; + u32 headersize; + u32 crc32; + u32 reserved; +} efi_table_hdr_t; + +#define EFI_CONVENTIONAL_MEMORY 7 + +#define EFI_MEMORY_MORE_RELIABLE \ + ((u64)0x0000000000010000ULL) /* higher reliability */ +#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ + +#define EFI_PAGE_SHIFT 12 + +typedef struct { + u32 type; + u32 pad; + u64 phys_addr; + u64 virt_addr; + u64 num_pages; + u64 attribute; +} efi_memory_desc_t; + +#define efi_early_memdesc_ptr(map, desc_size, n) \ + (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size))) + +typedef struct { + efi_guid_t guid; + u64 table; +} efi_config_table_64_t; + +typedef struct { + efi_guid_t guid; + u32 table; +} efi_config_table_32_t; + +typedef struct { + efi_table_hdr_t hdr; + u64 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 __pad1; + u64 con_in_handle; + u64 con_in; + u64 con_out_handle; + u64 con_out; + u64 stderr_handle; + u64 stderr; + u64 runtime; + u64 boottime; + u32 nr_tables; + u32 __pad2; + u64 tables; +} efi_system_table_64_t; + +typedef struct { + efi_table_hdr_t hdr; + u32 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 con_in_handle; + u32 con_in; + u32 con_out_handle; + u32 con_out; + u32 stderr_handle; + u32 stderr; + u32 runtime; + u32 boottime; + u32 nr_tables; + u32 tables; +} efi_system_table_32_t; + +/* kexec external ABI */ +struct efi_setup_data { + u64 fw_vendor; + u64 __unused; + u64 tables; + u64 smbios; + u64 reserved[8]; +}; + +static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) +{ + return memcmp(&left, &right, sizeof (efi_guid_t)); +} + +#ifdef CONFIG_EFI +bool __pure __efi_soft_reserve_enabled(void); + +static inline bool __pure efi_soft_reserve_enabled(void) +{ + return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE) + && __efi_soft_reserve_enabled(); +} +#else +static inline bool efi_soft_reserve_enabled(void) +{ + return false; +} +#endif /* CONFIG_EFI */ +#endif /* BOOT_COMPRESSED_EFI_H */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index dea95301196b..d33f060900d2 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -189,11 +189,11 @@ SYM_FUNC_START(startup_32) subl $32, %eax /* Encryption bit is always above bit 31 */ bts %eax, %edx /* Set encryption mask for page tables */ /* - * Mark SEV as active in sev_status so that startup32_check_sev_cbit() - * will do a check. The sev_status memory will be fully initialized - * with the contents of MSR_AMD_SEV_STATUS later in - * set_sev_encryption_mask(). For now it is sufficient to know that SEV - * is active. + * Set MSR_AMD64_SEV_ENABLED_BIT in sev_status so that + * startup32_check_sev_cbit() will do a check. sev_enable() will + * initialize sev_status with all the bits reported by + * MSR_AMD_SEV_STATUS later, but only MSR_AMD64_SEV_ENABLED_BIT + * needs to be set for now. */ movl $1, rva(sev_status)(%ebp) 1: @@ -289,7 +289,7 @@ SYM_FUNC_START(startup_32) pushl %eax /* Enter paged protected Mode, activating Long Mode */ - movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */ + movl $CR0_STATE, %eax movl %eax, %cr0 /* Jump from 32bit compatibility mode into 64bit mode. */ @@ -447,6 +447,23 @@ SYM_CODE_START(startup_64) call load_stage1_idt popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Now that the stage1 interrupt handlers are set up, #VC exceptions from + * CPUID instructions can be properly handled for SEV-ES guests. + * + * For SEV-SNP, the CPUID table also needs to be set up in advance of any + * CPUID instructions being issued, so go ahead and do that now via + * sev_enable(), which will also handle the rest of the SEV-related + * detection/setup to ensure that has been done in advance of any dependent + * code. + */ + pushq %rsi + movq %rsi, %rdi /* real mode address */ + call sev_enable + popq %rsi +#endif + /* * paging_prepare() sets up the trampoline and checks if we need to * enable 5-level paging. @@ -558,17 +575,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) shrq $3, %rcx rep stosq -/* - * If running as an SEV guest, the encryption mask is required in the - * page-table setup code below. When the guest also has SEV-ES enabled - * set_sev_encryption_mask() will cause #VC exceptions, but the stage2 - * handler can't map its GHCB because the page-table is not set up yet. - * So set up the encryption mask here while still on the stage1 #VC - * handler. Then load stage2 IDT and switch to the kernel's own - * page-table. - */ pushq %rsi - call set_sev_encryption_mask call load_stage2_idt /* Pass boot_params to initialize_identity_maps() */ @@ -642,12 +649,28 @@ SYM_CODE_START(trampoline_32bit_src) movl $MSR_EFER, %ecx rdmsr btsl $_EFER_LME, %eax + /* Avoid writing EFER if no change was made (for TDX guest) */ + jc 1f wrmsr - popl %edx +1: popl %edx popl %ecx +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. + */ + movl %cr4, %eax + andl $X86_CR4_MCE, %eax +#else + movl $0, %eax +#endif + /* Enable PAE and LA57 (if required) paging modes */ - movl $X86_CR4_PAE, %eax + orl $X86_CR4_PAE, %eax testl %edx, %edx jz 1f orl $X86_CR4_LA57, %eax @@ -661,8 +684,9 @@ SYM_CODE_START(trampoline_32bit_src) pushl $__KERNEL_CS pushl %eax - /* Enable paging again */ - movl $(X86_CR0_PG | X86_CR0_PE), %eax + /* Enable paging again. */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 lret diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index f7213d0943b8..44c350d627c7 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -90,7 +90,7 @@ static struct x86_mapping_info mapping_info; /* * Adds the specified range to the identity mappings. */ -static void add_identity_map(unsigned long start, unsigned long end) +void kernel_add_identity_map(unsigned long start, unsigned long end) { int ret; @@ -157,14 +157,15 @@ void initialize_identity_maps(void *rmode) * explicitly here in case the compressed kernel does not touch them, * or does not touch all the pages covering them. */ - add_identity_map((unsigned long)_head, (unsigned long)_end); + kernel_add_identity_map((unsigned long)_head, (unsigned long)_end); boot_params = rmode; - add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1)); + kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1)); cmdline = get_cmd_line_ptr(); - add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); + kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); + + sev_prep_identity_maps(top_level_pgt); /* Load the new page-table. */ - sev_verify_cbit(top_level_pgt); write_cr3(top_level_pgt); } @@ -246,10 +247,10 @@ static int set_clr_page_flags(struct x86_mapping_info *info, * It should already exist, but keep things generic. * * To map the page just read from it and fault it in if there is no - * mapping yet. add_identity_map() can't be called here because that - * would unconditionally map the address on PMD level, destroying any - * PTE-level mappings that might already exist. Use assembly here so - * the access won't be optimized away. + * mapping yet. kernel_add_identity_map() can't be called here because + * that would unconditionally map the address on PMD level, destroying + * any PTE-level mappings that might already exist. Use assembly here + * so the access won't be optimized away. */ asm volatile("mov %[address], %%r9" :: [address] "g" (*(unsigned long *)address) @@ -275,15 +276,31 @@ static int set_clr_page_flags(struct x86_mapping_info *info, * Changing encryption attributes of a page requires to flush it from * the caches. */ - if ((set | clr) & _PAGE_ENC) + if ((set | clr) & _PAGE_ENC) { clflush_page(address); + /* + * If the encryption attribute is being cleared, change the page state + * to shared in the RMP table. + */ + if (clr) + snp_set_page_shared(__pa(address & PAGE_MASK)); + } + /* Update PTE */ pte = *ptep; pte = pte_set_flags(pte, set); pte = pte_clear_flags(pte, clr); set_pte(ptep, pte); + /* + * If the encryption attribute is being set, then change the page state to + * private in the RMP entry. The page state change must be done after the PTE + * is updated. + */ + if (set & _PAGE_ENC) + snp_set_page_private(__pa(address & PAGE_MASK)); + /* Flush TLB after changing encryption attribute */ write_cr3(top_level_pgt); @@ -347,5 +364,5 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) * Error code is sane - now identity map the 2M region around * the faulting address. */ - add_identity_map(address, end); + kernel_add_identity_map(address, end); } diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 9b93567d663a..6debb816e83d 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -39,7 +39,23 @@ void load_stage1_idt(void) load_boot_idt(&boot_idt_desc); } -/* Setup IDT after kernel jumping to .Lrelocated */ +/* + * Setup IDT after kernel jumping to .Lrelocated. + * + * initialize_identity_maps() needs a #PF handler to be setup + * in order to be able to fault-in identity mapping ranges; see + * do_boot_page_fault(). + * + * This #PF handler setup needs to happen in load_stage2_idt() where the + * IDT is loaded and there the #VC IDT entry gets setup too. + * + * In order to be able to handle #VCs, one needs a GHCB which + * gets setup with an already set up pagetable, which is done in + * initialize_identity_maps(). And there's the catch 22: the boot #VC + * handler do_boot_stage2_vc() needs to call early_setup_ghcb() itself + * (and, especially set_page_decrypted()) because the SEV-ES setup code + * cannot initialize a GHCB as there's no #PF handler yet... + */ void load_stage2_idt(void) { boot_idt_desc.address = (unsigned long)boot_idt; diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 411b268bc0a2..4a3f223973f4 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -22,15 +22,14 @@ #include "misc.h" #include "error.h" #include "../string.h" +#include "efi.h" #include <generated/compile.h> #include <linux/module.h> #include <linux/uts.h> #include <linux/utsname.h> #include <linux/ctype.h> -#include <linux/efi.h> #include <generated/utsrelease.h> -#include <asm/efi.h> #define _SETUP #include <asm/setup.h> /* For COMMAND_LINE_SIZE */ diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index a63424d13627..a73e4d783cae 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -187,42 +187,6 @@ SYM_CODE_END(startup32_vc_handler) .code64 #include "../../kernel/sev_verify_cbit.S" -SYM_FUNC_START(set_sev_encryption_mask) -#ifdef CONFIG_AMD_MEM_ENCRYPT - push %rbp - push %rdx - - movq %rsp, %rbp /* Save current stack pointer */ - - call get_sev_encryption_bit /* Get the encryption bit position */ - testl %eax, %eax - jz .Lno_sev_mask - - bts %rax, sme_me_mask(%rip) /* Create the encryption mask */ - - /* - * Read MSR_AMD64_SEV again and store it to sev_status. Can't do this in - * get_sev_encryption_bit() because this function is 32-bit code and - * shared between 64-bit and 32-bit boot path. - */ - movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */ - rdmsr - - /* Store MSR value in sev_status */ - shlq $32, %rdx - orq %rdx, %rax - movq %rax, sev_status(%rip) - -.Lno_sev_mask: - movq %rbp, %rsp /* Restore original stack pointer */ - - pop %rdx - pop %rbp -#endif - - xor %rax, %rax - RET -SYM_FUNC_END(set_sev_encryption_mask) .data diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 1cdcaf34ee36..cf690d8712f4 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -48,12 +48,17 @@ void *memmove(void *dest, const void *src, size_t n); */ struct boot_params *boot_params; +struct port_io_ops pio_ops; + memptr free_mem_ptr; memptr free_mem_end_ptr; static char *vidmem; static int vidport; -static int lines, cols; + +/* These might be accessed before .bss is cleared, so use .data instead. */ +static int lines __section(".data"); +static int cols __section(".data"); #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" @@ -371,6 +376,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, lines = boot_params->screen_info.orig_video_lines; cols = boot_params->screen_info.orig_video_cols; + init_default_io_ops(); + + /* + * Detect TDX guest environment. + * + * It has to be done before console_init() in order to use + * paravirtualized port I/O operations if needed. + */ + early_tdx_detect(); + console_init(); /* diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 16ed360b6692..4910bf230d7b 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -22,17 +22,21 @@ #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> -#include <linux/io.h> #include <asm/page.h> #include <asm/boot.h> #include <asm/bootparam.h> #include <asm/desc_defs.h> +#include "tdx.h" + #define BOOT_CTYPE_H #include <linux/acpi.h> #define BOOT_BOOT_H #include "../ctype.h" +#include "../io.h" + +#include "efi.h" #ifdef CONFIG_X86_64 #define memptr long @@ -120,17 +124,23 @@ static inline void console_init(void) { } #endif -void set_sev_encryption_mask(void); - #ifdef CONFIG_AMD_MEM_ENCRYPT +void sev_enable(struct boot_params *bp); void sev_es_shutdown_ghcb(void); extern bool sev_es_check_ghcb_fault(unsigned long address); +void snp_set_page_private(unsigned long paddr); +void snp_set_page_shared(unsigned long paddr); +void sev_prep_identity_maps(unsigned long top_level_pgt); #else +static inline void sev_enable(struct boot_params *bp) { } static inline void sev_es_shutdown_ghcb(void) { } static inline bool sev_es_check_ghcb_fault(unsigned long address) { return false; } +static inline void snp_set_page_private(unsigned long paddr) { } +static inline void snp_set_page_shared(unsigned long paddr) { } +static inline void sev_prep_identity_maps(unsigned long top_level_pgt) { } #endif /* acpi.c */ @@ -151,6 +161,7 @@ static inline int count_immovable_mem_regions(void) { return 0; } #ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; #endif +extern void kernel_add_identity_map(unsigned long start, unsigned long end); /* Used by PAGE_KERN* macros: */ extern pteval_t __default_kernel_pte_mask; @@ -172,4 +183,47 @@ void boot_stage2_vc(void); unsigned long sev_verify_cbit(unsigned long cr3); +enum efi_type { + EFI_TYPE_64, + EFI_TYPE_32, + EFI_TYPE_NONE, +}; + +#ifdef CONFIG_EFI +/* helpers for early EFI config table access */ +enum efi_type efi_get_type(struct boot_params *bp); +unsigned long efi_get_system_table(struct boot_params *bp); +int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len); +unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid); +#else +static inline enum efi_type efi_get_type(struct boot_params *bp) +{ + return EFI_TYPE_NONE; +} + +static inline unsigned long efi_get_system_table(struct boot_params *bp) +{ + return 0; +} + +static inline int efi_get_conf_table(struct boot_params *bp, + unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len) +{ + return -ENOENT; +} + +static inline unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid) +{ + return 0; +} +#endif /* CONFIG_EFI */ + #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h index 6ff7e81b5628..cc9b2529a086 100644 --- a/arch/x86/boot/compressed/pgtable.h +++ b/arch/x86/boot/compressed/pgtable.h @@ -6,7 +6,7 @@ #define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 #define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE -#define TRAMPOLINE_32BIT_CODE_SIZE 0x70 +#define TRAMPOLINE_32BIT_CODE_SIZE 0x80 #define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index a1733319a22a..2ac12ff4111b 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,11 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" -#include <linux/efi.h> #include <asm/e820/types.h> #include <asm/processor.h> -#include <asm/efi.h> #include "pgtable.h" #include "../string.h" +#include "efi.h" #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 28bcf04c022e..52f989f6acc2 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -20,8 +20,10 @@ #include <asm/fpu/xcr.h> #include <asm/ptrace.h> #include <asm/svm.h> +#include <asm/cpuid.h> #include "error.h" +#include "../msr.h" struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); struct ghcb *boot_ghcb; @@ -56,23 +58,19 @@ static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) static inline u64 sev_es_rd_ghcb_msr(void) { - unsigned long low, high; + struct msr m; - asm volatile("rdmsr" : "=a" (low), "=d" (high) : - "c" (MSR_AMD64_SEV_ES_GHCB)); + boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); - return ((high << 32) | low); + return m.q; } static inline void sev_es_wr_ghcb_msr(u64 val) { - u32 low, high; + struct msr m; - low = val & 0xffffffffUL; - high = val >> 32; - - asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), - "a"(low), "d" (high) : "memory"); + m.q = val; + boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); } static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) @@ -119,11 +117,54 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, /* Include code for early handlers */ #include "../../kernel/sev-shared.c" -static bool early_setup_sev_es(void) +static inline bool sev_snp_enabled(void) +{ + return sev_status & MSR_AMD64_SEV_SNP_ENABLED; +} + +static void __page_state_change(unsigned long paddr, enum psc_op op) +{ + u64 val; + + if (!sev_snp_enabled()) + return; + + /* + * If private -> shared then invalidate the page before requesting the + * state change in the RMP table. + */ + if (op == SNP_PAGE_STATE_SHARED && pvalidate(paddr, RMP_PG_SIZE_4K, 0)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + /* Issue VMGEXIT to change the page state in RMP table. */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + /* Read the response of the VMGEXIT. */ + val = sev_es_rd_ghcb_msr(); + if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + /* + * Now that page state is changed in the RMP table, validate it so that it is + * consistent with the RMP entry. + */ + if (op == SNP_PAGE_STATE_PRIVATE && pvalidate(paddr, RMP_PG_SIZE_4K, 1)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); +} + +void snp_set_page_private(unsigned long paddr) +{ + __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); +} + +void snp_set_page_shared(unsigned long paddr) { - if (!sev_es_negotiate_protocol()) - sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED); + __page_state_change(paddr, SNP_PAGE_STATE_SHARED); +} +static bool early_setup_ghcb(void) +{ if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -135,6 +176,10 @@ static bool early_setup_sev_es(void) /* Initialize lookup tables for the instruction decoder */ inat_init_tables(); + /* SNP guest requires the GHCB GPA must be registered */ + if (sev_snp_enabled()) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); + return true; } @@ -174,8 +219,8 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) struct es_em_ctxt ctxt; enum es_result result; - if (!boot_ghcb && !early_setup_sev_es()) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + if (!boot_ghcb && !early_setup_ghcb()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); @@ -202,5 +247,191 @@ finish: if (result == ES_OK) vc_finish_insn(&ctxt); else if (result != ES_RETRY) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + +static void enforce_vmpl0(void) +{ + u64 attrs; + int err; + + /* + * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically + * higher) privilege level. Here, clear the VMPL1 permission mask of the + * GHCB page. If the guest is not running at VMPL0, this will fail. + * + * If the guest is running at VMPL0, it will succeed. Even if that operation + * modifies permission bits, it is still ok to do so currently because Linux + * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks + * changing is a don't-care. + */ + attrs = 1; + if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); +} + +void sev_enable(struct boot_params *bp) +{ + unsigned int eax, ebx, ecx, edx; + struct msr m; + bool snp; + + /* + * Setup/preliminary detection of SNP. This will be sanity-checked + * against CPUID/MSR values later. + */ + snp = snp_init(bp); + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + /* Check whether SEV is supported */ + if (!(eax & BIT(1))) { + if (snp) + error("SEV-SNP support indicated by CC blob, but not CPUID."); + return; + } + + /* Set the SME mask if this is an SEV guest. */ + boot_rdmsr(MSR_AMD64_SEV, &m); + sev_status = m.q; + if (!(sev_status & MSR_AMD64_SEV_ENABLED)) + return; + + /* Negotiate the GHCB protocol version. */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) { + if (!sev_es_negotiate_protocol()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); + } + + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { + if (!(get_hv_features() & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + enforce_vmpl0(); + } + + if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + error("SEV-SNP supported indicated by CC blob, but not SEV status MSR."); + + sme_me_mask = BIT_ULL(ebx & 0x3f); +} + +/* Search for Confidential Computing blob in the EFI config table. */ +static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) +{ + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + int ret; + + ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len); + if (ret) + return NULL; + + return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa, + cfg_table_len, + EFI_CC_BLOB_GUID); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the boot kernel + * by firmware/bootloader in the following ways: + * + * - via an entry in the EFI config table + * - via a setup_data structure, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + cc_info = find_cc_blob_efi(bp); + if (cc_info) + goto found_cc_info; + + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + return cc_info; +} + +/* + * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks + * will verify the SNP CPUID/MSR bits. + */ +bool snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + /* + * If a SNP-specific Confidential Computing blob is present, then + * firmware/bootloader have indicated SNP support. Verifying this + * involves CPUID checks which will be more reliable if the SNP + * CPUID table is used. See comments over snp_setup_cpuid_table() for + * more details. + */ + setup_cpuid_table(cc_info); + + /* + * Pass run-time kernel a pointer to CC info via boot_params so EFI + * config table doesn't need to be searched again during early startup + * phase. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void sev_prep_identity_maps(unsigned long top_level_pgt) +{ + /* + * The Confidential Computing blob is used very early in uncompressed + * kernel to find the in-memory CPUID table to handle CPUID + * instructions. Make sure an identity-mapping exists so it can be + * accessed after switchover. + */ + if (sev_snp_enabled()) { + unsigned long cc_info_pa = boot_params->cc_blob_address; + struct cc_blob_sev_info *cc_info; + + kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info)); + + cc_info = (struct cc_blob_sev_info *)cc_info_pa; + kernel_add_identity_map(cc_info->cpuid_phys, cc_info->cpuid_phys + cc_info->cpuid_len); + } + + sev_verify_cbit(top_level_pgt); } diff --git a/arch/x86/boot/compressed/tdcall.S b/arch/x86/boot/compressed/tdcall.S new file mode 100644 index 000000000000..46d0495e0d3a --- /dev/null +++ b/arch/x86/boot/compressed/tdcall.S @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "../../coco/tdx/tdcall.S" diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c new file mode 100644 index 000000000000..918a7606f53c --- /dev/null +++ b/arch/x86/boot/compressed/tdx.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "../cpuflags.h" +#include "../string.h" +#include "../io.h" +#include "error.h" + +#include <vdso/limits.h> +#include <uapi/asm/vmx.h> + +#include <asm/shared/tdx.h> + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void) +{ + error("TDVMCALL failed. TDX module bug?"); +} + +static inline unsigned int tdx_io_in(int size, u16 port) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_IO_INSTRUCTION, + .r12 = size, + .r13 = 0, + .r14 = port, + }; + + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return UINT_MAX; + + return args.r11; +} + +static inline void tdx_io_out(int size, u16 port, u32 value) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_IO_INSTRUCTION, + .r12 = size, + .r13 = 1, + .r14 = port, + .r15 = value, + }; + + __tdx_hypercall(&args, 0); +} + +static inline u8 tdx_inb(u16 port) +{ + return tdx_io_in(1, port); +} + +static inline void tdx_outb(u8 value, u16 port) +{ + tdx_io_out(1, port, value); +} + +static inline void tdx_outw(u16 value, u16 port) +{ + tdx_io_out(2, port, value); +} + +void early_tdx_detect(void) +{ + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); + + if (memcmp(TDX_IDENT, sig, sizeof(sig))) + return; + + /* Use hypercalls instead of I/O instructions */ + pio_ops.f_inb = tdx_inb; + pio_ops.f_outb = tdx_outb; + pio_ops.f_outw = tdx_outw; +} diff --git a/arch/x86/boot/compressed/tdx.h b/arch/x86/boot/compressed/tdx.h new file mode 100644 index 000000000000..9055482cd35c --- /dev/null +++ b/arch/x86/boot/compressed/tdx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_TDX_H +#define BOOT_COMPRESSED_TDX_H + +#include <linux/types.h> + +#ifdef CONFIG_INTEL_TDX_GUEST +void early_tdx_detect(void); +#else +static inline void early_tdx_detect(void) { }; +#endif + +#endif /* BOOT_COMPRESSED_TDX_H */ diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index e1478d32de1a..fed8d13ce252 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -27,6 +27,7 @@ #include <asm/required-features.h> #include <asm/msr-index.h> #include "string.h" +#include "msr.h" static u32 err_flags[NCAPINTS]; @@ -130,12 +131,11 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) /* If this is an AMD and we're only missing SSE+SSE2, try to turn them on */ - u32 ecx = MSR_K7_HWCR; - u32 eax, edx; + struct msr m; - asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); - eax &= ~(1 << 15); - asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + boot_rdmsr(MSR_K7_HWCR, &m); + m.l &= ~(1 << 15); + boot_wrmsr(MSR_K7_HWCR, &m); get_cpuflags(); /* Make sure it really did something */ err = check_cpuflags(); @@ -145,28 +145,28 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) /* If this is a VIA C3, we might have to enable CX8 explicitly */ - u32 ecx = MSR_VIA_FCR; - u32 eax, edx; + struct msr m; - asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); - eax |= (1<<1)|(1<<7); - asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + boot_rdmsr(MSR_VIA_FCR, &m); + m.l |= (1 << 1) | (1 << 7); + boot_wrmsr(MSR_VIA_FCR, &m); set_bit(X86_FEATURE_CX8, cpu.flags); err = check_cpuflags(); } else if (err == 0x01 && is_transmeta()) { /* Transmeta might have masked feature bits in word 0 */ - u32 ecx = 0x80860004; - u32 eax, edx; + struct msr m, m_tmp; u32 level = 1; - asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); - asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx)); + boot_rdmsr(0x80860004, &m); + m_tmp = m; + m_tmp.l = ~0; + boot_wrmsr(0x80860004, &m_tmp); asm("cpuid" : "+a" (level), "=d" (cpu.flags[0]) : : "ecx", "ebx"); - asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + boot_wrmsr(0x80860004, &m); err = check_cpuflags(); } else if (err == 0x01 && diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index a0b75f73dc63..a83d67ec627d 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -71,8 +71,7 @@ int has_eflag(unsigned long mask) # define EBX_REG "=b" #endif -static inline void cpuid_count(u32 id, u32 count, - u32 *a, u32 *b, u32 *c, u32 *d) +void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d) { asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t" "cpuid \n\t" diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 2e20814d3ce3..475b8fde90f7 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -17,5 +17,6 @@ extern u32 cpu_vendor[3]; int has_eflag(unsigned long mask); void get_cpuflags(void); +void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d); #endif diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6dbd7e9f74c9..0352e4589efa 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -163,7 +163,11 @@ extra_header_fields: .long 0x200 # SizeOfHeaders .long 0 # CheckSum .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) +#ifdef CONFIG_DXE_MEM_ATTRIBUTES + .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics +#else .word 0 # DllCharacteristics +#endif #ifdef CONFIG_X86_32 .long 0 # SizeOfStackReserve .long 0 # SizeOfStackCommit diff --git a/arch/x86/boot/io.h b/arch/x86/boot/io.h new file mode 100644 index 000000000000..110880907f87 --- /dev/null +++ b/arch/x86/boot/io.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_IO_H +#define BOOT_IO_H + +#include <asm/shared/io.h> + +#undef inb +#undef inw +#undef inl +#undef outb +#undef outw +#undef outl + +struct port_io_ops { + u8 (*f_inb)(u16 port); + void (*f_outb)(u8 v, u16 port); + void (*f_outw)(u16 v, u16 port); +}; + +extern struct port_io_ops pio_ops; + +/* + * Use the normal I/O instructions by default. + * TDX guests override these to use hypercalls. + */ +static inline void init_default_io_ops(void) +{ + pio_ops.f_inb = __inb; + pio_ops.f_outb = __outb; + pio_ops.f_outw = __outw; +} + +/* + * Redirect port I/O operations via pio_ops callbacks. + * TDX guests override these callbacks with TDX-specific helpers. + */ +#define inb pio_ops.f_inb +#define outb pio_ops.f_outb +#define outw pio_ops.f_outw + +#endif diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index e3add857c2c9..c4ea5258ab55 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -17,6 +17,8 @@ struct boot_params boot_params __attribute__((aligned(16))); +struct port_io_ops pio_ops; + char *HEAP = _end; char *heap_end = _end; /* Default end of heap = no heap */ @@ -33,7 +35,7 @@ static void copy_boot_params(void) u16 cl_offset; }; const struct old_cmdline * const oldcmd = - (const struct old_cmdline *)OLD_CL_ADDRESS; + absolute_pointer(OLD_CL_ADDRESS); BUILD_BUG_ON(sizeof(boot_params) != 4096); memcpy(&boot_params.hdr, &hdr, sizeof(hdr)); @@ -133,6 +135,8 @@ static void init_heap(void) void main(void) { + init_default_io_ops(); + /* First, copy the boot header into the "zeropage" */ copy_boot_params(); diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h new file mode 100644 index 000000000000..aed66f7ae199 --- /dev/null +++ b/arch/x86/boot/msr.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Helpers/definitions related to MSR access. + */ + +#ifndef BOOT_MSR_H +#define BOOT_MSR_H + +#include <asm/shared/msr.h> + +/* + * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the + * boot kernel since they rely on tracepoint/exception handling infrastructure + * that's not available here. + */ +static inline void boot_rdmsr(unsigned int reg, struct msr *m) +{ + asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg)); +} + +static inline void boot_wrmsr(unsigned int reg, const struct msr *m) +{ + asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory"); +} + +#endif /* BOOT_MSR_H */ diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile index c1ead00017a7..c816acf78b6a 100644 --- a/arch/x86/coco/Makefile +++ b/arch/x86/coco/Makefile @@ -4,3 +4,5 @@ KASAN_SANITIZE_core.o := n CFLAGS_core.o += -fno-stack-protector obj-y += core.o + +obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/ diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index fc1365dd927e..49b44f881484 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -18,7 +18,15 @@ static u64 cc_mask __ro_after_init; static bool intel_cc_platform_has(enum cc_attr attr) { - return false; + switch (attr) { + case CC_ATTR_GUEST_UNROLL_STRING_IO: + case CC_ATTR_HOTPLUG_DISABLED: + case CC_ATTR_GUEST_MEM_ENCRYPT: + case CC_ATTR_MEM_ENCRYPT: + return true; + default: + return false; + } } /* @@ -57,6 +65,9 @@ static bool amd_cc_platform_has(enum cc_attr attr) return (sev_status & MSR_AMD64_SEV_ENABLED) && !(sev_status & MSR_AMD64_SEV_ES_ENABLED); + case CC_ATTR_GUEST_SEV_SNP: + return sev_status & MSR_AMD64_SEV_SNP_ENABLED; + default: return false; } @@ -87,9 +98,18 @@ EXPORT_SYMBOL_GPL(cc_platform_has); u64 cc_mkenc(u64 val) { + /* + * Both AMD and Intel use a bit in the page table to indicate + * encryption status of the page. + * + * - for AMD, bit *set* means the page is encrypted + * - for Intel *clear* means encrypted. + */ switch (vendor) { case CC_VENDOR_AMD: return val | cc_mask; + case CC_VENDOR_INTEL: + return val & ~cc_mask; default: return val; } @@ -97,9 +117,12 @@ u64 cc_mkenc(u64 val) u64 cc_mkdec(u64 val) { + /* See comment in cc_mkenc() */ switch (vendor) { case CC_VENDOR_AMD: return val & ~cc_mask; + case CC_VENDOR_INTEL: + return val | cc_mask; default: return val; } diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile new file mode 100644 index 000000000000..46c55998557d --- /dev/null +++ b/arch/x86/coco/tdx/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += tdx.o tdcall.o diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S new file mode 100644 index 000000000000..f9eb1134f22d --- /dev/null +++ b/arch/x86/coco/tdx/tdcall.S @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <asm/asm-offsets.h> +#include <asm/asm.h> +#include <asm/frame.h> +#include <asm/unwind_hints.h> + +#include <linux/linkage.h> +#include <linux/bits.h> +#include <linux/errno.h> + +#include "../../virt/vmx/tdx/tdxcall.S" + +/* + * Bitmasks of exposed registers (with VMM). + */ +#define TDX_R10 BIT(10) +#define TDX_R11 BIT(11) +#define TDX_R12 BIT(12) +#define TDX_R13 BIT(13) +#define TDX_R14 BIT(14) +#define TDX_R15 BIT(15) + +/* + * These registers are clobbered to hold arguments for each + * TDVMCALL. They are safe to expose to the VMM. + * Each bit in this mask represents a register ID. Bit field + * details can be found in TDX GHCI specification, section + * titled "TDCALL [TDG.VP.VMCALL] leaf". + */ +#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \ + TDX_R12 | TDX_R13 | \ + TDX_R14 | TDX_R15 ) + +/* + * __tdx_module_call() - Used by TDX guests to request services from + * the TDX module (does not include VMM services) using TDCALL instruction. + * + * Transforms function call register arguments into the TDCALL register ABI. + * After TDCALL operation, TDX module output is saved in @out (if it is + * provided by the user). + * + *------------------------------------------------------------------------- + * TDCALL ABI: + *------------------------------------------------------------------------- + * Input Registers: + * + * RAX - TDCALL Leaf number. + * RCX,RDX,R8-R9 - TDCALL Leaf specific input registers. + * + * Output Registers: + * + * RAX - TDCALL instruction error code. + * RCX,RDX,R8-R11 - TDCALL Leaf specific output registers. + * + *------------------------------------------------------------------------- + * + * __tdx_module_call() function ABI: + * + * @fn (RDI) - TDCALL Leaf ID, moved to RAX + * @rcx (RSI) - Input parameter 1, moved to RCX + * @rdx (RDX) - Input parameter 2, moved to RDX + * @r8 (RCX) - Input parameter 3, moved to R8 + * @r9 (R8) - Input parameter 4, moved to R9 + * + * @out (R9) - struct tdx_module_output pointer + * stored temporarily in R12 (not + * shared with the TDX module). It + * can be NULL. + * + * Return status of TDCALL via RAX. + */ +SYM_FUNC_START(__tdx_module_call) + FRAME_BEGIN + TDX_MODULE_CALL host=0 + FRAME_END + RET +SYM_FUNC_END(__tdx_module_call) + +/* + * __tdx_hypercall() - Make hypercalls to a TDX VMM using TDVMCALL leaf + * of TDCALL instruction + * + * Transforms values in function call argument struct tdx_hypercall_args @args + * into the TDCALL register ABI. After TDCALL operation, VMM output is saved + * back in @args. + * + *------------------------------------------------------------------------- + * TD VMCALL ABI: + *------------------------------------------------------------------------- + * + * Input Registers: + * + * RAX - TDCALL instruction leaf number (0 - TDG.VP.VMCALL) + * RCX - BITMAP which controls which part of TD Guest GPR + * is passed as-is to the VMM and back. + * R10 - Set 0 to indicate TDCALL follows standard TDX ABI + * specification. Non zero value indicates vendor + * specific ABI. + * R11 - VMCALL sub function number + * RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments. + * R8-R9, R12-R15 - Same as above. + * + * Output Registers: + * + * RAX - TDCALL instruction status (Not related to hypercall + * output). + * R10 - Hypercall output error code. + * R11-R15 - Hypercall sub function specific output values. + * + *------------------------------------------------------------------------- + * + * __tdx_hypercall() function ABI: + * + * @args (RDI) - struct tdx_hypercall_args for input and output + * @flags (RSI) - TDX_HCALL_* flags + * + * On successful completion, return the hypercall error code. + */ +SYM_FUNC_START(__tdx_hypercall) + FRAME_BEGIN + + /* Save callee-saved GPRs as mandated by the x86_64 ABI */ + push %r15 + push %r14 + push %r13 + push %r12 + + /* Mangle function call ABI into TDCALL ABI: */ + /* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */ + xor %eax, %eax + + /* Copy hypercall registers from arg struct: */ + movq TDX_HYPERCALL_r10(%rdi), %r10 + movq TDX_HYPERCALL_r11(%rdi), %r11 + movq TDX_HYPERCALL_r12(%rdi), %r12 + movq TDX_HYPERCALL_r13(%rdi), %r13 + movq TDX_HYPERCALL_r14(%rdi), %r14 + movq TDX_HYPERCALL_r15(%rdi), %r15 + + movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx + + /* + * For the idle loop STI needs to be called directly before the TDCALL + * that enters idle (EXIT_REASON_HLT case). STI instruction enables + * interrupts only one instruction later. If there is a window between + * STI and the instruction that emulates the HALT state, there is a + * chance for interrupts to happen in this window, which can delay the + * HLT operation indefinitely. Since this is the not the desired + * result, conditionally call STI before TDCALL. + */ + testq $TDX_HCALL_ISSUE_STI, %rsi + jz .Lskip_sti + sti +.Lskip_sti: + tdcall + + /* + * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that + * something has gone horribly wrong with the TDX module. + * + * The return status of the hypercall operation is in a separate + * register (in R10). Hypercall errors are a part of normal operation + * and are handled by callers. + */ + testq %rax, %rax + jne .Lpanic + + /* TDVMCALL leaf return code is in R10 */ + movq %r10, %rax + + /* Copy hypercall result registers to arg struct if needed */ + testq $TDX_HCALL_HAS_OUTPUT, %rsi + jz .Lout + + movq %r10, TDX_HYPERCALL_r10(%rdi) + movq %r11, TDX_HYPERCALL_r11(%rdi) + movq %r12, TDX_HYPERCALL_r12(%rdi) + movq %r13, TDX_HYPERCALL_r13(%rdi) + movq %r14, TDX_HYPERCALL_r14(%rdi) + movq %r15, TDX_HYPERCALL_r15(%rdi) +.Lout: + /* + * Zero out registers exposed to the VMM to avoid speculative execution + * with VMM-controlled values. This needs to include all registers + * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15 + * context will be restored. + */ + xor %r10d, %r10d + xor %r11d, %r11d + + /* Restore callee-saved GPRs as mandated by the x86_64 ABI */ + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + FRAME_END + + RET +.Lpanic: + call __tdx_hypercall_failed + /* __tdx_hypercall_failed never returns */ + REACHABLE + jmp .Lpanic +SYM_FUNC_END(__tdx_hypercall) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c new file mode 100644 index 000000000000..03deb4d6920d --- /dev/null +++ b/arch/x86/coco/tdx/tdx.c @@ -0,0 +1,692 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021-2022 Intel Corporation */ + +#undef pr_fmt +#define pr_fmt(fmt) "tdx: " fmt + +#include <linux/cpufeature.h> +#include <asm/coco.h> +#include <asm/tdx.h> +#include <asm/vmx.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <asm/pgtable.h> + +/* TDX module Call Leaf IDs */ +#define TDX_GET_INFO 1 +#define TDX_GET_VEINFO 3 +#define TDX_ACCEPT_PAGE 6 + +/* TDX hypercall Leaf IDs */ +#define TDVMCALL_MAP_GPA 0x10001 + +/* MMIO direction */ +#define EPT_READ 0 +#define EPT_WRITE 1 + +/* Port I/O direction */ +#define PORT_READ 0 +#define PORT_WRITE 1 + +/* See Exit Qualification for I/O Instructions in VMX documentation */ +#define VE_IS_IO_IN(e) ((e) & BIT(3)) +#define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1) +#define VE_GET_PORT_NUM(e) ((e) >> 16) +#define VE_IS_IO_STRING(e) ((e) & BIT(4)) + +/* + * Wrapper for standard use of __tdx_hypercall with no output aside from + * return code. + */ +static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = fn, + .r12 = r12, + .r13 = r13, + .r14 = r14, + .r15 = r15, + }; + + return __tdx_hypercall(&args, 0); +} + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void) +{ + panic("TDVMCALL failed. TDX module bug?"); +} + +/* + * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined + * independently from but are currently matched 1:1 with VMX EXIT_REASONs. + * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and + * guest sides of these calls. + */ +static u64 hcall_func(u64 exit_reason) +{ + return exit_reason; +} + +#ifdef CONFIG_KVM_GUEST +long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, + unsigned long p3, unsigned long p4) +{ + struct tdx_hypercall_args args = { + .r10 = nr, + .r11 = p1, + .r12 = p2, + .r13 = p3, + .r14 = p4, + }; + + return __tdx_hypercall(&args, 0); +} +EXPORT_SYMBOL_GPL(tdx_kvm_hypercall); +#endif + +/* + * Used for TDX guests to make calls directly to the TD module. This + * should only be used for calls that have no legitimate reason to fail + * or where the kernel can not survive the call failing. + */ +static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + struct tdx_module_output *out) +{ + if (__tdx_module_call(fn, rcx, rdx, r8, r9, out)) + panic("TDCALL %lld failed (Buggy TDX module!)\n", fn); +} + +static u64 get_cc_mask(void) +{ + struct tdx_module_output out; + unsigned int gpa_width; + + /* + * TDINFO TDX module call is used to get the TD execution environment + * information like GPA width, number of available vcpus, debug mode + * information, etc. More details about the ABI can be found in TDX + * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL + * [TDG.VP.INFO]. + * + * The GPA width that comes out of this call is critical. TDX guests + * can not meaningfully run without it. + */ + tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out); + + gpa_width = out.rcx & GENMASK(5, 0); + + /* + * The highest bit of a guest physical address is the "sharing" bit. + * Set it for shared pages and clear it for private pages. + */ + return BIT_ULL(gpa_width - 1); +} + +static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_HLT), + .r12 = irq_disabled, + }; + + /* + * Emulate HLT operation via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>. + * + * The VMM uses the "IRQ disabled" param to understand IRQ + * enabled status (RFLAGS.IF) of the TD guest and to determine + * whether or not it should schedule the halted vCPU if an + * IRQ becomes pending. E.g. if IRQs are disabled, the VMM + * can keep the vCPU in virtual HLT, even if an IRQ is + * pending, without hanging/breaking the guest. + */ + return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0); +} + +static bool handle_halt(void) +{ + /* + * Since non safe halt is mainly used in CPU offlining + * and the guest will always stay in the halt state, don't + * call the STI instruction (set do_sti as false). + */ + const bool irq_disabled = irqs_disabled(); + const bool do_sti = false; + + if (__halt(irq_disabled, do_sti)) + return false; + + return true; +} + +void __cpuidle tdx_safe_halt(void) +{ + /* + * For do_sti=true case, __tdx_hypercall() function enables + * interrupts using the STI instruction before the TDCALL. So + * set irq_disabled as false. + */ + const bool irq_disabled = false; + const bool do_sti = true; + + /* + * Use WARN_ONCE() to report the failure. + */ + if (__halt(irq_disabled, do_sti)) + WARN_ONCE(1, "HLT instruction emulation failed\n"); +} + +static bool read_msr(struct pt_regs *regs) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_MSR_READ), + .r12 = regs->cx, + }; + + /* + * Emulate the MSR read via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>". + */ + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return false; + + regs->ax = lower_32_bits(args.r11); + regs->dx = upper_32_bits(args.r11); + return true; +} + +static bool write_msr(struct pt_regs *regs) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_MSR_WRITE), + .r12 = regs->cx, + .r13 = (u64)regs->dx << 32 | regs->ax, + }; + + /* + * Emulate the MSR write via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>". + */ + return !__tdx_hypercall(&args, 0); +} + +static bool handle_cpuid(struct pt_regs *regs) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_CPUID), + .r12 = regs->ax, + .r13 = regs->cx, + }; + + /* + * Only allow VMM to control range reserved for hypervisor + * communication. + * + * Return all-zeros for any CPUID outside the range. It matches CPU + * behaviour for non-supported leaf. + */ + if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) { + regs->ax = regs->bx = regs->cx = regs->dx = 0; + return true; + } + + /* + * Emulate the CPUID instruction via a hypercall. More info about + * ABI can be found in TDX Guest-Host-Communication Interface + * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>". + */ + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return false; + + /* + * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of + * EAX, EBX, ECX, EDX registers after the CPUID instruction execution. + * So copy the register contents back to pt_regs. + */ + regs->ax = args.r12; + regs->bx = args.r13; + regs->cx = args.r14; + regs->dx = args.r15; + + return true; +} + +static bool mmio_read(int size, unsigned long addr, unsigned long *val) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION), + .r12 = size, + .r13 = EPT_READ, + .r14 = addr, + .r15 = *val, + }; + + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return false; + *val = args.r11; + return true; +} + +static bool mmio_write(int size, unsigned long addr, unsigned long val) +{ + return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size, + EPT_WRITE, addr, val); +} + +static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) +{ + char buffer[MAX_INSN_SIZE]; + unsigned long *reg, val; + struct insn insn = {}; + enum mmio_type mmio; + int size, extend_size; + u8 extend_val = 0; + + /* Only in-kernel MMIO is supported */ + if (WARN_ON_ONCE(user_mode(regs))) + return false; + + if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE)) + return false; + + if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64)) + return false; + + mmio = insn_decode_mmio(&insn, &size); + if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) + return false; + + if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + reg = insn_get_modrm_reg_ptr(&insn, regs); + if (!reg) + return false; + } + + ve->instr_len = insn.length; + + /* Handle writes first */ + switch (mmio) { + case MMIO_WRITE: + memcpy(&val, reg, size); + return mmio_write(size, ve->gpa, val); + case MMIO_WRITE_IMM: + val = insn.immediate.value; + return mmio_write(size, ve->gpa, val); + case MMIO_READ: + case MMIO_READ_ZERO_EXTEND: + case MMIO_READ_SIGN_EXTEND: + /* Reads are handled below */ + break; + case MMIO_MOVS: + case MMIO_DECODE_FAILED: + /* + * MMIO was accessed with an instruction that could not be + * decoded or handled properly. It was likely not using io.h + * helpers or accessed MMIO accidentally. + */ + return false; + default: + WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?"); + return false; + } + + /* Handle reads */ + if (!mmio_read(size, ve->gpa, &val)) + return false; + + switch (mmio) { + case MMIO_READ: + /* Zero-extend for 32-bit operation */ + extend_size = size == 4 ? sizeof(*reg) : 0; + break; + case MMIO_READ_ZERO_EXTEND: + /* Zero extend based on operand size */ + extend_size = insn.opnd_bytes; + break; + case MMIO_READ_SIGN_EXTEND: + /* Sign extend based on operand size */ + extend_size = insn.opnd_bytes; + if (size == 1 && val & BIT(7)) + extend_val = 0xFF; + else if (size > 1 && val & BIT(15)) + extend_val = 0xFF; + break; + default: + /* All other cases has to be covered with the first switch() */ + WARN_ON_ONCE(1); + return false; + } + + if (extend_size) + memset(reg, extend_val, extend_size); + memcpy(reg, &val, size); + return true; +} + +static bool handle_in(struct pt_regs *regs, int size, int port) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), + .r12 = size, + .r13 = PORT_READ, + .r14 = port, + }; + u64 mask = GENMASK(BITS_PER_BYTE * size, 0); + bool success; + + /* + * Emulate the I/O read via hypercall. More info about ABI can be found + * in TDX Guest-Host-Communication Interface (GHCI) section titled + * "TDG.VP.VMCALL<Instruction.IO>". + */ + success = !__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT); + + /* Update part of the register affected by the emulated instruction */ + regs->ax &= ~mask; + if (success) + regs->ax |= args.r11 & mask; + + return success; +} + +static bool handle_out(struct pt_regs *regs, int size, int port) +{ + u64 mask = GENMASK(BITS_PER_BYTE * size, 0); + + /* + * Emulate the I/O write via hypercall. More info about ABI can be found + * in TDX Guest-Host-Communication Interface (GHCI) section titled + * "TDG.VP.VMCALL<Instruction.IO>". + */ + return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size, + PORT_WRITE, port, regs->ax & mask); +} + +/* + * Emulate I/O using hypercall. + * + * Assumes the IO instruction was using ax, which is enforced + * by the standard io.h macros. + * + * Return True on success or False on failure. + */ +static bool handle_io(struct pt_regs *regs, u32 exit_qual) +{ + int size, port; + bool in; + + if (VE_IS_IO_STRING(exit_qual)) + return false; + + in = VE_IS_IO_IN(exit_qual); + size = VE_GET_IO_SIZE(exit_qual); + port = VE_GET_PORT_NUM(exit_qual); + + + if (in) + return handle_in(regs, size, port); + else + return handle_out(regs, size, port); +} + +/* + * Early #VE exception handler. Only handles a subset of port I/O. + * Intended only for earlyprintk. If failed, return false. + */ +__init bool tdx_early_handle_ve(struct pt_regs *regs) +{ + struct ve_info ve; + + tdx_get_ve_info(&ve); + + if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION) + return false; + + return handle_io(regs, ve.exit_qual); +} + +void tdx_get_ve_info(struct ve_info *ve) +{ + struct tdx_module_output out; + + /* + * Called during #VE handling to retrieve the #VE info from the + * TDX module. + * + * This has to be called early in #VE handling. A "nested" #VE which + * occurs before this will raise a #DF and is not recoverable. + * + * The call retrieves the #VE info from the TDX module, which also + * clears the "#VE valid" flag. This must be done before anything else + * because any #VE that occurs while the valid flag is set will lead to + * #DF. + * + * Note, the TDX module treats virtual NMIs as inhibited if the #VE + * valid flag is set. It means that NMI=>#VE will not result in a #DF. + */ + tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out); + + /* Transfer the output parameters */ + ve->exit_reason = out.rcx; + ve->exit_qual = out.rdx; + ve->gla = out.r8; + ve->gpa = out.r9; + ve->instr_len = lower_32_bits(out.r10); + ve->instr_info = upper_32_bits(out.r10); +} + +/* Handle the user initiated #VE */ +static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve) +{ + switch (ve->exit_reason) { + case EXIT_REASON_CPUID: + return handle_cpuid(regs); + default: + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + return false; + } +} + +/* Handle the kernel #VE */ +static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) +{ + switch (ve->exit_reason) { + case EXIT_REASON_HLT: + return handle_halt(); + case EXIT_REASON_MSR_READ: + return read_msr(regs); + case EXIT_REASON_MSR_WRITE: + return write_msr(regs); + case EXIT_REASON_CPUID: + return handle_cpuid(regs); + case EXIT_REASON_EPT_VIOLATION: + return handle_mmio(regs, ve); + case EXIT_REASON_IO_INSTRUCTION: + return handle_io(regs, ve->exit_qual); + default: + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + return false; + } +} + +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) +{ + bool ret; + + if (user_mode(regs)) + ret = virt_exception_user(regs, ve); + else + ret = virt_exception_kernel(regs, ve); + + /* After successful #VE handling, move the IP */ + if (ret) + regs->ip += ve->instr_len; + + return ret; +} + +static bool tdx_tlb_flush_required(bool private) +{ + /* + * TDX guest is responsible for flushing TLB on private->shared + * transition. VMM is responsible for flushing on shared->private. + * + * The VMM _can't_ flush private addresses as it can't generate PAs + * with the guest's HKID. Shared memory isn't subject to integrity + * checking, i.e. the VMM doesn't need to flush for its own protection. + * + * There's no need to flush when converting from shared to private, + * as flushing is the VMM's responsibility in this case, e.g. it must + * flush to avoid integrity failures in the face of a buggy or + * malicious guest. + */ + return !private; +} + +static bool tdx_cache_flush_required(void) +{ + /* + * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence. + * TDX doesn't have such capability. + * + * Flush cache unconditionally. + */ + return true; +} + +static bool try_accept_one(phys_addr_t *start, unsigned long len, + enum pg_level pg_level) +{ + unsigned long accept_size = page_level_size(pg_level); + u64 tdcall_rcx; + u8 page_size; + + if (!IS_ALIGNED(*start, accept_size)) + return false; + + if (len < accept_size) + return false; + + /* + * Pass the page physical address to the TDX module to accept the + * pending, private page. + * + * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. + */ + switch (pg_level) { + case PG_LEVEL_4K: + page_size = 0; + break; + case PG_LEVEL_2M: + page_size = 1; + break; + case PG_LEVEL_1G: + page_size = 2; + break; + default: + return false; + } + + tdcall_rcx = *start | page_size; + if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) + return false; + + *start += accept_size; + return true; +} + +/* + * Inform the VMM of the guest's intent for this physical page: shared with + * the VMM or private to the guest. The VMM is expected to change its mapping + * of the page in response. + */ +static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc) +{ + phys_addr_t start = __pa(vaddr); + phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE); + + if (!enc) { + /* Set the shared (decrypted) bits: */ + start |= cc_mkdec(0); + end |= cc_mkdec(0); + } + + /* + * Notify the VMM about page mapping conversion. More info about ABI + * can be found in TDX Guest-Host-Communication Interface (GHCI), + * section "TDG.VP.VMCALL<MapGPA>" + */ + if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0)) + return false; + + /* private->shared conversion requires only MapGPA call */ + if (!enc) + return true; + + /* + * For shared->private conversion, accept the page using + * TDX_ACCEPT_PAGE TDX module call. + */ + while (start < end) { + unsigned long len = end - start; + + /* + * Try larger accepts first. It gives chance to VMM to keep + * 1G/2M SEPT entries where possible and speeds up process by + * cutting number of hypercalls (if successful). + */ + + if (try_accept_one(&start, len, PG_LEVEL_1G)) + continue; + + if (try_accept_one(&start, len, PG_LEVEL_2M)) + continue; + + if (!try_accept_one(&start, len, PG_LEVEL_4K)) + return false; + } + + return true; +} + +void __init tdx_early_init(void) +{ + u64 cc_mask; + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); + + if (memcmp(TDX_IDENT, sig, sizeof(sig))) + return; + + setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); + + cc_set_vendor(CC_VENDOR_INTEL); + cc_mask = get_cc_mask(); + cc_set_mask(cc_mask); + + /* + * All bits above GPA width are reserved and kernel treats shared bit + * as flag, not as part of physical address. + * + * Adjust physical mask to only cover valid GPA bits. + */ + physical_mask &= cc_mask - 1; + + x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required; + x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required; + x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed; + + pr_info("Guest detected\n"); +} diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index a4c061fb7c6e..29b36e9e4e74 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -63,7 +63,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_REGS rdx=%rdx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -73,7 +73,7 @@ For 32-bit we have the following conventions - kernel is built with pushq %rsi /* pt_regs->si */ .endif pushq \rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ + pushq \rcx /* pt_regs->cx */ pushq \rax /* pt_regs->ax */ pushq %r8 /* pt_regs->r8 */ pushq %r9 /* pt_regs->r9 */ @@ -99,6 +99,7 @@ For 32-bit we have the following conventions - kernel is built with * well before they could be put to use in a speculative execution * gadget. */ + xorl %esi, %esi /* nospec si */ xorl %edx, %edx /* nospec dx */ xorl %ecx, %ecx /* nospec cx */ xorl %r8d, %r8d /* nospec r8 */ @@ -114,32 +115,24 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 - PUSH_REGS rdx=\rdx, rax=\rax, save_ret=\save_ret +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 + PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret CLEAR_REGS .endm -.macro POP_REGS pop_rdi=1 skip_r11rcx=0 +.macro POP_REGS pop_rdi=1 popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx - .if \skip_r11rcx - popq %rsi - .else popq %r11 - .endif popq %r10 popq %r9 popq %r8 popq %rax - .if \skip_r11rcx - popq %rsi - .else popq %rcx - .endif popq %rdx popq %rsi .if \pop_rdi diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 73d958522b6a..4300ba49b5ee 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,8 +191,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - POP_REGS pop_rdi=0 skip_r11rcx=1 + POP_REGS pop_rdi=0 /* * Now all regs are restored except RSP and RDI. @@ -215,8 +214,13 @@ syscall_return_via_sysret: popq %rdi popq %rsp +SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) + ANNOTATE_NOENDBR swapgs sysretq +SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + int3 SYM_CODE_END(entry_SYSCALL_64) /* @@ -318,6 +322,14 @@ SYM_CODE_END(ret_from_fork) #endif .endm +/* Save all registers in pt_regs */ +SYM_CODE_START_LOCAL(push_and_clear_regs) + UNWIND_HINT_FUNC + PUSH_AND_CLEAR_REGS save_ret=1 + ENCODE_FRAME_POINTER 8 + RET +SYM_CODE_END(push_and_clear_regs) + /** * idtentry_body - Macro to emit code calling the C function * @cfunc: C function to be called @@ -325,7 +337,21 @@ SYM_CODE_END(ret_from_fork) */ .macro idtentry_body cfunc has_error_code:req - call error_entry + call push_and_clear_regs + UNWIND_HINT_REGS + + /* + * Call error_entry() and switch to the task stack if from userspace. + * + * When in XENPV, it is already in the task stack, and it can't fault + * for native_iret() nor native_load_gs_index() since XENPV uses its + * own pvops for IRET and load_gs_index(). And it doesn't need to + * switch the CR3. So it can skip invoking error_entry(). + */ + ALTERNATIVE "call error_entry; movq %rax, %rsp", \ + "", X86_FEATURE_XENPV + + ENCODE_FRAME_POINTER UNWIND_HINT_REGS movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ @@ -358,6 +384,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 ENDBR ASM_CLAC + cld .if \has_error_code == 0 pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -426,6 +453,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS ENDBR ASM_CLAC + cld pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -482,6 +510,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS ENDBR ASM_CLAC + cld /* * If the entry is from userspace, switch stacks and treat it as @@ -508,6 +537,7 @@ SYM_CODE_START(\asmsym) call vc_switch_off_ist movq %rax, %rsp /* Switch to new stack */ + ENCODE_FRAME_POINTER UNWIND_HINT_REGS /* Update pt_regs */ @@ -544,6 +574,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS offset=8 ENDBR ASM_CLAC + cld /* paranoid_entry returns GS information for paranoid_exit in EBX. */ call paranoid_entry @@ -869,7 +900,6 @@ SYM_CODE_END(xen_failsafe_callback) */ SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC - cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 @@ -983,13 +1013,10 @@ SYM_CODE_START_LOCAL(paranoid_exit) SYM_CODE_END(paranoid_exit) /* - * Save all registers in pt_regs, and switch GS if needed. + * Switch GS and CR3 if needed. */ SYM_CODE_START_LOCAL(error_entry) UNWIND_HINT_FUNC - cld - PUSH_AND_CLEAR_REGS save_ret=1 - ENCODE_FRAME_POINTER 8 testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -997,19 +1024,15 @@ SYM_CODE_START_LOCAL(error_entry) * We entered from user mode or we're pretending to have entered * from user mode due to an IRET fault. */ - SWAPGS + swapgs FENCE_SWAPGS_USER_ENTRY /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ .Lerror_entry_from_usermode_after_swapgs: /* Put us onto the real thread stack. */ - popq %r12 /* save return addr in %12 */ - movq %rsp, %rdi /* arg0 = pt_regs pointer */ call sync_regs - movq %rax, %rsp /* switch stack */ - ENCODE_FRAME_POINTER - pushq %r12 RET /* @@ -1033,7 +1056,7 @@ SYM_CODE_START_LOCAL(error_entry) * gsbase and proceed. We'll fix up the exception and land in * .Lgs_change's error handler with kernel gsbase. */ - SWAPGS + swapgs /* * Issue an LFENCE to prevent GS speculation, regardless of whether it is a @@ -1041,6 +1064,7 @@ SYM_CODE_START_LOCAL(error_entry) */ .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY + leaq 8(%rsp), %rax /* return pt_regs pointer */ RET .Lbstep_iret: @@ -1053,7 +1077,7 @@ SYM_CODE_START_LOCAL(error_entry) * We came from an IRET to user mode, so we have user * gsbase and CR3. Switch to kernel gsbase and CR3: */ - SWAPGS + swapgs FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax @@ -1061,9 +1085,9 @@ SYM_CODE_START_LOCAL(error_entry) * Pretend that the exception came from user mode: set up pt_regs * as if we faulted immediately after IRET. */ - mov %rsp, %rdi + leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ call fixup_bad_iret - mov %rax, %rsp + mov %rax, %rdi jmp .Lerror_entry_from_usermode_after_swapgs SYM_CODE_END(error_entry) @@ -1126,6 +1150,7 @@ SYM_CODE_START(asm_exc_nmi) */ ASM_CLAC + cld /* Use %rdx as our temp variable throughout */ pushq %rdx @@ -1145,7 +1170,6 @@ SYM_CODE_START(asm_exc_nmi) */ swapgs - cld FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4fdb007cddbd..d1052742ad0c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -50,7 +50,7 @@ SYM_CODE_START(entry_SYSENTER_compat) UNWIND_HINT_EMPTY ENDBR /* Interrupts are off on entry. */ - SWAPGS + swapgs pushq %rax SWITCH_TO_KERNEL_CR3 scratch_reg=%rax @@ -83,32 +83,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) movl %eax, %eax pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 = 0 */ - xorl %r8d, %r8d /* nospec r8 */ - pushq $0 /* pt_regs->r9 = 0 */ - xorl %r9d, %r9d /* nospec r9 */ - pushq $0 /* pt_regs->r10 = 0 */ - xorl %r10d, %r10d /* nospec r10 */ - pushq $0 /* pt_regs->r11 = 0 */ - xorl %r11d, %r11d /* nospec r11 */ - pushq %rbx /* pt_regs->rbx */ - xorl %ebx, %ebx /* nospec rbx */ - pushq %rbp /* pt_regs->rbp (will be overwritten) */ - xorl %ebp, %ebp /* nospec rbp */ - pushq $0 /* pt_regs->r12 = 0 */ - xorl %r12d, %r12d /* nospec r12 */ - pushq $0 /* pt_regs->r13 = 0 */ - xorl %r13d, %r13d /* nospec r13 */ - pushq $0 /* pt_regs->r14 = 0 */ - xorl %r14d, %r14d /* nospec r14 */ - pushq $0 /* pt_regs->r15 = 0 */ - xorl %r15d, %r15d /* nospec r15 */ - + PUSH_AND_CLEAR_REGS rax=$-ENOSYS UNWIND_HINT_REGS cld @@ -225,35 +200,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - xorl %esi, %esi /* nospec si */ - pushq %rdx /* pt_regs->dx */ - xorl %edx, %edx /* nospec dx */ - pushq %rbp /* pt_regs->cx (stashed in bp) */ - xorl %ecx, %ecx /* nospec cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 = 0 */ - xorl %r8d, %r8d /* nospec r8 */ - pushq $0 /* pt_regs->r9 = 0 */ - xorl %r9d, %r9d /* nospec r9 */ - pushq $0 /* pt_regs->r10 = 0 */ - xorl %r10d, %r10d /* nospec r10 */ - pushq $0 /* pt_regs->r11 = 0 */ - xorl %r11d, %r11d /* nospec r11 */ - pushq %rbx /* pt_regs->rbx */ - xorl %ebx, %ebx /* nospec rbx */ - pushq %rbp /* pt_regs->rbp (will be overwritten) */ - xorl %ebp, %ebp /* nospec rbp */ - pushq $0 /* pt_regs->r12 = 0 */ - xorl %r12d, %r12d /* nospec r12 */ - pushq $0 /* pt_regs->r13 = 0 */ - xorl %r13d, %r13d /* nospec r13 */ - pushq $0 /* pt_regs->r14 = 0 */ - xorl %r14d, %r14d /* nospec r14 */ - pushq $0 /* pt_regs->r15 = 0 */ - xorl %r15d, %r15d /* nospec r15 */ - + PUSH_AND_CLEAR_REGS rcx=%rbp rax=$-ENOSYS UNWIND_HINT_REGS movq %rsp, %rdi @@ -297,6 +244,8 @@ sysret32_from_system_call: * code. We zero R8-R10 to avoid info leaks. */ movq RSP-ORIG_RAX(%rsp), %rsp +SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) + ANNOTATE_NOENDBR /* * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored @@ -314,6 +263,9 @@ sysret32_from_system_call: xorl %r10d, %r10d swapgs sysretl +SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + int3 SYM_CODE_END(entry_SYSCALL_compat) /* @@ -362,54 +314,25 @@ SYM_CODE_START(entry_INT80_compat) /* switch to thread stack expects orig_ax and rdi to be pushed */ pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax /* In the Xen PV case we already run on the thread stack. */ ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV - movq %rsp, %rdi + movq %rsp, %rax movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - pushq 6*8(%rdi) /* regs->ss */ - pushq 5*8(%rdi) /* regs->rsp */ - pushq 4*8(%rdi) /* regs->eflags */ - pushq 3*8(%rdi) /* regs->cs */ - pushq 2*8(%rdi) /* regs->ip */ - pushq 1*8(%rdi) /* regs->orig_ax */ - pushq (%rdi) /* pt_regs->di */ + pushq 5*8(%rax) /* regs->ss */ + pushq 4*8(%rax) /* regs->rsp */ + pushq 3*8(%rax) /* regs->eflags */ + pushq 2*8(%rax) /* regs->cs */ + pushq 1*8(%rax) /* regs->ip */ + pushq 0*8(%rax) /* regs->orig_ax */ .Lint80_keep_stack: - pushq %rsi /* pt_regs->si */ - xorl %esi, %esi /* nospec si */ - pushq %rdx /* pt_regs->dx */ - xorl %edx, %edx /* nospec dx */ - pushq %rcx /* pt_regs->cx */ - xorl %ecx, %ecx /* nospec cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ - xorl %r8d, %r8d /* nospec r8 */ - pushq %r9 /* pt_regs->r9 */ - xorl %r9d, %r9d /* nospec r9 */ - pushq %r10 /* pt_regs->r10*/ - xorl %r10d, %r10d /* nospec r10 */ - pushq %r11 /* pt_regs->r11 */ - xorl %r11d, %r11d /* nospec r11 */ - pushq %rbx /* pt_regs->rbx */ - xorl %ebx, %ebx /* nospec rbx */ - pushq %rbp /* pt_regs->rbp */ - xorl %ebp, %ebp /* nospec rbp */ - pushq %r12 /* pt_regs->r12 */ - xorl %r12d, %r12d /* nospec r12 */ - pushq %r13 /* pt_regs->r13 */ - xorl %r13d, %r13d /* nospec r13 */ - pushq %r14 /* pt_regs->r14 */ - xorl %r14d, %r14d /* nospec r14 */ - pushq %r15 /* pt_regs->r15 */ - xorl %r15d, %r15d /* nospec r15 */ - + PUSH_AND_CLEAR_REGS rax=$-ENOSYS UNWIND_HINT_REGS cld diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 693f8b9031fb..c2a8b76ae0bc 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -91,7 +91,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),) endif endif -$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. @@ -148,6 +148,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32)) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 235a5794296a..1000d457c332 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -438,7 +438,7 @@ bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) static __init int vdso_setup(char *s) { vdso64_enabled = simple_strtoul(s, NULL, 0); - return 0; + return 1; } __setup("vdso=", vdso_setup); diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index fd2ee9408e91..4af81df133ee 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -48,7 +48,7 @@ static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) XONLY; #else - EMULATE; + #error VSYSCALL config is broken #endif static int __init vsyscall_setup(char *str) diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index d6cdfe631674..09c56965750a 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -44,4 +44,12 @@ config PERF_EVENTS_AMD_UNCORE To compile this driver as a module, choose M here: the module will be called 'amd-uncore'. + +config PERF_EVENTS_AMD_BRS + depends on PERF_EVENTS && CPU_SUP_AMD + bool "AMD Zen3 Branch Sampling support" + help + Enable AMD Zen3 branch sampling support (BRS) which samples up to + 16 consecutive taken branches in registers. + endmenu diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index 6cbe38d5fd9d..b9f5d4610256 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 000000000000..bee8765a1e9b --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian <eranian@google.com> + */ +#include <linux/kernel.h> +#include <linux/jump_label.h> +#include <asm/msr.h> +#include <asm/cpufeature.h> + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. Software filtering is not supported because it would not correlate well + * with a sampling period. + */ +int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + return 0; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_BRS)) + return; + + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if (--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +static bool amd_brs_match_plm(struct perf_event *event, u64 to) +{ + int type = event->attr.branch_sample_type; + int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV; + int plm_u = PERF_SAMPLE_BRANCH_USER; + + if (!(type & plm_k) && kernel_ip(to)) + return 0; + + if (!(type & plm_u) && !kernel_ip(to)) + return 0; + + return 1; +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. + * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + /* + * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + if (!amd_brs_match_plm(event, to)) + continue; + + rdmsrl(brs_from(brs_idx), from); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. + * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. + */ + if (sched_in) + amd_brs_poison_buffer(); +} + +/* + * called from ACPI processor_idle.c or acpi_pad.c + * with interrupts disabled + */ +void perf_amd_brs_lopwr_cb(bool lopwr_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* + * on mwait in, we may end up in non C0 state. + * we must disable branch sampling to avoid holding the NMI + * for too long. We disable it in hardware but we + * keep the state in cpuc, so we can re-enable. + * + * The hardware will deliver the NMI if needed when brsmen cleared + */ + if (cpuc->brs_active) { + cfg.val = get_debug_extn_cfg(); + cfg.brsmen = !lopwr_in; + set_debug_extn_cfg(cfg.val); + } +} + +DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb); + +void __init amd_brs_lopwr_init(void) +{ + static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9687a8aef01c..9ac3718410ce 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/perf_event.h> +#include <linux/jump_label.h> #include <linux/export.h> #include <linux/types.h> #include <linux/init.h> @@ -7,6 +8,7 @@ #include <linux/delay.h> #include <linux/jiffies.h> #include <asm/apicdef.h> +#include <asm/apic.h> #include <asm/nmi.h> #include "../perf_event.h" @@ -18,6 +20,9 @@ static unsigned long perf_nmi_window; #define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) #define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) +/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */ +static u64 amd_pmu_global_cntr_mask __read_mostly; + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -325,8 +330,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + static int amd_core_hw_config(struct perf_event *event) { + int ret = 0; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -343,7 +356,66 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - return 0; + /* + * if branch stack is requested + */ + if (has_branch_stack(event)) { + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + } + return ret; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -366,7 +438,7 @@ static int amd_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && get_ibs_caps()) return -ENOENT; - if (has_branch_stack(event)) + if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; ret = x86_pmu_hw_config(event); @@ -510,6 +582,18 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +static void amd_pmu_cpu_reset(int cpu) +{ + if (x86_pmu.version < 2) + return; + + /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + + /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); +} + static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -555,6 +639,9 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_brs_reset(); + amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -574,8 +661,54 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } + + amd_pmu_cpu_reset(cpu); +} + +static inline void amd_pmu_set_global_ctl(u64 ctl) +{ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); } +static inline u64 amd_pmu_get_global_status(void) +{ + u64 status; + + /* PerfCntrGlobalStatus is read-only */ + rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + + return status & amd_pmu_global_cntr_mask; +} + +static inline void amd_pmu_ack_global_status(u64 status) +{ + /* + * PerfCntrGlobalStatus is read-only but an overflow acknowledgment + * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr + * clears the same bit in PerfCntrGlobalStatus + */ + + /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ + status &= amd_pmu_global_cntr_mask; + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); +} + +static bool amd_pmu_test_overflow_topbit(int idx) +{ + u64 counter; + + rdmsrl(x86_pmu_event_addr(idx), counter); + + return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1)); +} + +static bool amd_pmu_test_overflow_status(int idx) +{ + return amd_pmu_get_global_status() & BIT_ULL(idx); +} + +DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit); + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -588,7 +721,6 @@ static void amd_pmu_cpu_dead(int cpu) static void amd_pmu_wait_on_overflow(int idx) { unsigned int i; - u64 counter; /* * Wait for the counter to be reset if it has overflowed. This loop @@ -596,8 +728,7 @@ static void amd_pmu_wait_on_overflow(int idx) * forever... */ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { - rdmsrl(x86_pmu_event_addr(idx), counter); - if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + if (!static_call(amd_pmu_test_overflow)(idx)) break; /* Might be in IRQ context, so can't sleep */ @@ -605,13 +736,11 @@ static void amd_pmu_wait_on_overflow(int idx) } } -static void amd_pmu_disable_all(void) +static void amd_pmu_check_overflow(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - x86_pmu_disable_all(); - /* * This shouldn't be called from NMI context, but add a safeguard here * to return, since if we're in NMI context we can't wait for an NMI @@ -634,6 +763,47 @@ static void amd_pmu_disable_all(void) } } +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + +static void amd_pmu_v2_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * Testing cpu_hw_events.enabled should be skipped in this case unlike + * in x86_pmu_enable_event(). + * + * Since cpu_hw_events.enabled is set only after returning from + * x86_pmu_start(), the PMCs must be programmed and kept ready. + * Counting starts only after x86_pmu_enable_all() is called. + */ + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -651,6 +821,32 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_disable_all(void) +{ + amd_brs_disable_all(); + x86_pmu_disable_all(); + amd_pmu_check_overflow(); +} + +static void amd_pmu_v2_disable_all(void) +{ + /* Disable all PMCs */ + amd_pmu_set_global_ctl(0); + amd_pmu_check_overflow(); +} + +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_add(event); +} + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_del(event); +} + /* * Because of NMI latency, if multiple PMC counters are active or other sources * of NMIs are received, the perf NMI handler can handle one or more overflowed @@ -669,13 +865,8 @@ static void amd_pmu_disable_event(struct perf_event *event) * handled a counter. When an un-handled NMI is received, it will be claimed * only if arriving within that window. */ -static int amd_pmu_handle_irq(struct pt_regs *regs) +static inline int amd_pmu_adjust_nmi_window(int handled) { - int handled; - - /* Process any counter overflows */ - handled = x86_pmu_handle_irq(regs); - /* * If a counter was handled, record a timestamp such that un-handled * NMIs will be claimed if arriving within that window. @@ -692,6 +883,113 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) return NMI_HANDLED; } +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int handled; + int pmu_enabled; + + /* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* stop everything (includes BRS) */ + amd_pmu_disable_all(); + + /* Drain BRS is in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); + + /* Process any counter overflows */ + handled = x86_pmu_handle_irq(regs); + + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_pmu_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); +} + +static int amd_pmu_v2_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_sample_data data; + struct hw_perf_event *hwc; + struct perf_event *event; + int handled = 0, idx; + u64 status, mask; + bool pmu_enabled; + + /* + * Save the PMU state as it needs to be restored when leaving the + * handler + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* Stop counting */ + amd_pmu_v2_disable_all(); + + status = amd_pmu_get_global_status(); + + /* Check if any overflows are pending */ + if (!status) + goto done; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + x86_perf_event_update(event); + mask = BIT_ULL(idx); + + if (!(status & mask)) + continue; + + /* Event overflow */ + handled++; + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + + status &= ~mask; + } + + /* + * It should never be the case that some overflows are not handled as + * the corresponding PMCs are expected to be inactive according to the + * active_mask + */ + WARN_ON(status > 0); + + /* Clear overflow bits */ + amd_pmu_ack_global_status(~status); + + /* + * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT + * PMI entry is not set by the local APIC when a PMC overflow occurs + */ + inc_irq_stat(apic_perf_irqs); + +done: + cpuc->enabled = pmu_enabled; + + /* Resume counting only if PMU is active */ + if (pmu_enabled) + amd_pmu_v2_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); +} + static struct event_constraint * amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -897,6 +1195,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, --cpuc->n_pair; } +/* + * Because of the way BRS operates with an inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by chosing only one counter for events using BRS. + * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. + * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it but only on counter 0 & 1 to enforce + * multiplexing requiring to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? &amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -905,12 +1248,31 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } +static void amd_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (sched_in && x86_pmu.lbr_nr) + amd_pmu_brs_sched_task(ctx, sched_in); +} + +static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) +{ + /* + * Decrease period by the depth of the BRS feature to get the last N + * taken branches and approximate the desired period + */ + if (has_branch_stack(event) && left > x86_pmu.lbr_nr) + left -= x86_pmu.lbr_nr; + + return left; +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -920,6 +1282,8 @@ static __initconst const struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -938,8 +1302,55 @@ static __initconst const struct x86_pmu amd_pmu = { .amd_nb_constraints = 1, }; +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_brs_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + +static struct attribute_group group_caps_amd_brs = { + .name = "caps", + .attrs = amd_pmu_brs_attrs, + .is_visible = amd_brs_is_visible, +}; + +EVENT_ATTR_STR(branch-brs, amd_branch_brs, + "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); + +static struct attribute *amd_brs_events_attrs[] = { + EVENT_PTR(amd_branch_brs), + NULL, +}; + +static struct attribute_group group_events_amd_brs = { + .name = "events", + .attrs = amd_brs_events_attrs, + .is_visible = amd_brs_is_visible, +}; + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_brs, + &group_events_amd_brs, + NULL, +}; + static int __init amd_core_pmu_init(void) { + union cpuid_0x80000022_ebx ebx; u64 even_ctr_mask = 0ULL; int i; @@ -957,6 +1368,27 @@ static int __init amd_core_pmu_init(void) x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + /* Check for Performance Monitoring v2 support */ + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + + /* Update PMU version for later usage */ + x86_pmu.version = 2; + + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + + /* Update PMC handling functions */ + x86_pmu.enable_all = amd_pmu_v2_enable_all; + x86_pmu.disable_all = amd_pmu_v2_disable_all; + x86_pmu.enable = amd_pmu_v2_enable_event; + x86_pmu.handle_irq = amd_pmu_v2_handle_irq; + static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status); + } + /* * AMD Core perfctr has separate MSRs for the NB events, see * the amd/uncore.c driver. @@ -989,6 +1421,23 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } + /* + * BRS requires special event constraints and flushing on ctxsw. + */ + if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_sched_task; + x86_pmu.limit_period = amd_pmu_limit_period; + /* + * put_event_constraints callback same as Fam17h, set above + */ + + /* branch sampling must be stopped when entering low power */ + amd_brs_lopwr_init(); + } + + x86_pmu.attr_update = amd_attr_update; + pr_cont("core perfctr, "); return 0; } @@ -1023,6 +1472,24 @@ __init int amd_pmu_init(void) return 0; } +static inline void amd_pmu_reload_virt(void) +{ + if (x86_pmu.version >= 2) { + /* + * Clear global enable bits, reprogram the PERF_CTL + * registers with updated perf_ctr_virt_mask and then + * set global enable bits once again + */ + amd_pmu_v2_disable_all(); + amd_pmu_enable_all(0); + amd_pmu_v2_enable_all(0); + return; + } + + amd_pmu_disable_all(); + amd_pmu_enable_all(0); +} + void amd_pmu_enable_virt(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1030,8 +1497,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -1048,7 +1514,6 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 9739019d4b67..c251bc44c088 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -94,10 +94,6 @@ struct perf_ibs { unsigned int fetch_ignore_if_zero_rip : 1; struct cpu_perf_ibs __percpu *pcpu; - struct attribute **format_attrs; - struct attribute_group format_group; - const struct attribute_group *attr_groups[2]; - u64 (*get_count)(u64 config); }; @@ -304,6 +300,16 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. Setting _EARLY flag + * makes sure we unwind call-stack before perf sample rip is set to + * IbsOpRip. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; + return 0; } @@ -518,16 +524,118 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +/* + * We need to initialize with empty group if all attributes in the + * group are dynamic. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group empty_format_group = { + .name = "format", + .attrs = attrs_empty, +}; + +static struct attribute_group empty_caps_group = { + .name = "caps", + .attrs = attrs_empty, +}; + +static const struct attribute_group *empty_attr_groups[] = { + &empty_format_group, + &empty_caps_group, + NULL, +}; + PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); +PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); -static struct attribute *ibs_fetch_format_attrs[] = { +static umode_t +zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; +} + +static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; -static struct attribute *ibs_op_format_attrs[] = { - NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ +static struct attribute *fetch_l3missonly_attrs[] = { + &fetch_l3missonly.attr.attr, + NULL, +}; + +static struct attribute *zen4_ibs_extensions_attrs[] = { + &zen4_ibs_extensions.attr.attr, + NULL, +}; + +static struct attribute_group group_rand_en = { + .name = "format", + .attrs = rand_en_attrs, +}; + +static struct attribute_group group_fetch_l3missonly = { + .name = "format", + .attrs = fetch_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + +static struct attribute_group group_zen4_ibs_extensions = { + .name = "caps", + .attrs = zen4_ibs_extensions_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + +static const struct attribute_group *fetch_attr_groups[] = { + &group_rand_en, + &empty_caps_group, + NULL, +}; + +static const struct attribute_group *fetch_attr_update[] = { + &group_fetch_l3missonly, + &group_zen4_ibs_extensions, + NULL, +}; + +static umode_t +cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0; +} + +static struct attribute *cnt_ctl_attrs[] = { + &format_attr_cnt_ctl.attr, + NULL, +}; + +static struct attribute *op_l3missonly_attrs[] = { + &op_l3missonly.attr.attr, + NULL, +}; + +static struct attribute_group group_cnt_ctl = { + .name = "format", + .attrs = cnt_ctl_attrs, + .is_visible = cnt_ctl_is_visible, +}; + +static struct attribute_group group_op_l3missonly = { + .name = "format", + .attrs = op_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + +static const struct attribute_group *op_attr_update[] = { + &group_cnt_ctl, + &group_op_l3missonly, + &group_zen4_ibs_extensions, NULL, }; @@ -551,7 +659,6 @@ static struct perf_ibs perf_ibs_fetch = { .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, - .format_attrs = ibs_fetch_format_attrs, .get_count = get_ibs_fetch_count, }; @@ -577,7 +684,6 @@ static struct perf_ibs perf_ibs_op = { .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, - .format_attrs = ibs_op_format_attrs, .get_count = get_ibs_op_count, }; @@ -687,6 +793,14 @@ fail: data.raw = &raw; } + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + data.callchain = perf_callchain(event, iregs); + throttle = perf_event_overflow(event, &data, ®s); out: if (throttle) { @@ -739,17 +853,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) perf_ibs->pcpu = pcpu; - /* register attributes */ - if (perf_ibs->format_attrs[0]) { - memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); - perf_ibs->format_group.name = "format"; - perf_ibs->format_group.attrs = perf_ibs->format_attrs; - - memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); - perf_ibs->attr_groups[0] = &perf_ibs->format_group; - perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; - } - ret = perf_pmu_register(&perf_ibs->pmu, name, -1); if (ret) { perf_ibs->pcpu = NULL; @@ -759,10 +862,8 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init void perf_event_ibs_init(void) +static __init int perf_ibs_fetch_init(void) { - struct attribute **attr = ibs_op_format_attrs; - /* * Some chips fail to reset the fetch count when it is written; instead * they need a 0-1 transition of IbsFetchEn. @@ -773,12 +874,19 @@ static __init void perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY; + + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + perf_ibs_fetch.pmu.attr_update = fetch_attr_update; - if (ibs_caps & IBS_CAPS_OPCNT) { + return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); +} + +static __init int perf_ibs_op_init(void) +{ + if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; - *attr++ = &format_attr_cnt_ctl.attr; - } if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; @@ -786,15 +894,52 @@ static __init void perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } - perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; + + perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_update = op_attr_update; + + return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); +} + +static __init int perf_event_ibs_init(void) +{ + int ret; + + ret = perf_ibs_fetch_init(); + if (ret) + return ret; + + ret = perf_ibs_op_init(); + if (ret) + goto err_op; + + ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + if (ret) + goto err_nmi; - register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); + return 0; + +err_nmi: + perf_pmu_unregister(&perf_ibs_op.pmu); + free_percpu(perf_ibs_op.pcpu); + perf_ibs_op.pcpu = NULL; +err_op: + perf_pmu_unregister(&perf_ibs_fetch.pmu); + free_percpu(perf_ibs_fetch.pcpu); + perf_ibs_fetch.pcpu = NULL; + + return ret; } #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ -static __init void perf_event_ibs_init(void) { } +static __init int perf_event_ibs_init(void) +{ + return 0; +} #endif @@ -1064,9 +1209,7 @@ static __init int amd_ibs_init(void) x86_pmu_amd_ibs_starting_cpu, x86_pmu_amd_ibs_dying_cpu); - perf_event_ibs_init(); - - return 0; + return perf_event_ibs_init(); } /* Since we need the pci subsystem to init ibs we can't do this earlier: */ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index eef816fc216d..30788894124f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1338,6 +1338,10 @@ static void x86_pmu_enable(struct pmu *pmu) if (hwc->state & PERF_HES_ARCH) continue; + /* + * if cpuc->enabled = 0, then no wrmsr as + * per x86_pmu_enable_event() + */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1704,11 +1708,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } @@ -1837,7 +1845,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, cha /* string trumps id */ if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); + return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fc7f458eb3de..955ae91c56dc 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6216,7 +6216,9 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ALDERLAKE: case INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: /* * Alder Lake has 2 types of CPU, core and atom. * diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 48e5db21142c..8ec23f47fee9 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -682,7 +682,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index fe1742c4ca49..13179f31fe10 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -769,6 +769,7 @@ void intel_pmu_lbr_disable_all(void) void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; @@ -784,15 +785,11 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].in_tx = 0; - cpuc->lbr_entries[i].abort = 0; - cpuc->lbr_entries[i].cycles = 0; - cpuc->lbr_entries[i].type = 0; - cpuc->lbr_entries[i].reserved = 0; + perf_clear_branch_entry_bitfields(br); + + br->from = msr_lastbranch.from; + br->to = msr_lastbranch.to; + br++; } cpuc->lbr_stack.nr = i; cpuc->lbr_stack.hw_idx = tos; @@ -807,6 +804,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -878,15 +876,14 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (abort && x86_pmu.lbr_double_abort && out > 0) out--; - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].type = 0; - cpuc->lbr_entries[out].reserved = 0; + perf_clear_branch_entry_bitfields(br+out); + br[out].from = from; + br[out].to = to; + br[out].mispred = mis; + br[out].predicted = pred; + br[out].in_tx = in_tx; + br[out].abort = abort; + br[out].cycles = cycles; out++; } cpuc->lbr_stack.nr = out; @@ -951,6 +948,8 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, to = rdlbr_to(i, lbr); info = rdlbr_info(i, lbr); + perf_clear_branch_entry_bitfields(e); + e->from = from; e->to = to; e->mispred = get_lbr_mispred(info); @@ -959,7 +958,6 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); e->type = get_lbr_br_type(info); - e->reserved = 0; } cpuc->lbr_stack.nr = i; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7695dcae280e..db6c31bca809 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1828,7 +1828,9 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 4262351f52b6..ce440011cc4e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -79,10 +79,43 @@ #define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650 #define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668 #define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670 +#define PCI_DEVICE_ID_INTEL_ADL_17_IMC 0x4614 +#define PCI_DEVICE_ID_INTEL_ADL_18_IMC 0x4617 +#define PCI_DEVICE_ID_INTEL_ADL_19_IMC 0x4618 +#define PCI_DEVICE_ID_INTEL_ADL_20_IMC 0x461B +#define PCI_DEVICE_ID_INTEL_ADL_21_IMC 0x461C #define PCI_DEVICE_ID_INTEL_RPL_1_IMC 0xA700 #define PCI_DEVICE_ID_INTEL_RPL_2_IMC 0xA702 #define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706 #define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709 +#define PCI_DEVICE_ID_INTEL_RPL_5_IMC 0xA701 +#define PCI_DEVICE_ID_INTEL_RPL_6_IMC 0xA703 +#define PCI_DEVICE_ID_INTEL_RPL_7_IMC 0xA704 +#define PCI_DEVICE_ID_INTEL_RPL_8_IMC 0xA705 +#define PCI_DEVICE_ID_INTEL_RPL_9_IMC 0xA706 +#define PCI_DEVICE_ID_INTEL_RPL_10_IMC 0xA707 +#define PCI_DEVICE_ID_INTEL_RPL_11_IMC 0xA708 +#define PCI_DEVICE_ID_INTEL_RPL_12_IMC 0xA709 +#define PCI_DEVICE_ID_INTEL_RPL_13_IMC 0xA70a +#define PCI_DEVICE_ID_INTEL_RPL_14_IMC 0xA70b +#define PCI_DEVICE_ID_INTEL_RPL_15_IMC 0xA715 +#define PCI_DEVICE_ID_INTEL_RPL_16_IMC 0xA716 +#define PCI_DEVICE_ID_INTEL_RPL_17_IMC 0xA717 +#define PCI_DEVICE_ID_INTEL_RPL_18_IMC 0xA718 +#define PCI_DEVICE_ID_INTEL_RPL_19_IMC 0xA719 +#define PCI_DEVICE_ID_INTEL_RPL_20_IMC 0xA71A +#define PCI_DEVICE_ID_INTEL_RPL_21_IMC 0xA71B +#define PCI_DEVICE_ID_INTEL_RPL_22_IMC 0xA71C +#define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728 +#define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729 +#define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A + + +#define IMC_UNCORE_DEV(a) \ +{ \ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_##a##_IMC), \ + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), \ +} /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -849,242 +882,80 @@ static struct intel_uncore_type *snb_pci_uncores[] = { }; static const struct pci_device_id snb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(SNB), { /* end: all zeroes */ }, }; static const struct pci_device_id ivb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(IVB), + IMC_UNCORE_DEV(IVB_E3), { /* end: all zeroes */ }, }; static const struct pci_device_id hsw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(HSW), + IMC_UNCORE_DEV(HSW_U), { /* end: all zeroes */ }, }; static const struct pci_device_id bdw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(BDW), { /* end: all zeroes */ }, }; static const struct pci_device_id skl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_Y_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_HQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_WQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(SKL_Y), + IMC_UNCORE_DEV(SKL_U), + IMC_UNCORE_DEV(SKL_HD), + IMC_UNCORE_DEV(SKL_HQ), + IMC_UNCORE_DEV(SKL_SD), + IMC_UNCORE_DEV(SKL_SQ), + IMC_UNCORE_DEV(SKL_E3), + IMC_UNCORE_DEV(KBL_Y), + IMC_UNCORE_DEV(KBL_U), + IMC_UNCORE_DEV(KBL_UQ), + IMC_UNCORE_DEV(KBL_SD), + IMC_UNCORE_DEV(KBL_SQ), + IMC_UNCORE_DEV(KBL_HQ), + IMC_UNCORE_DEV(KBL_WQ), + IMC_UNCORE_DEV(CFL_2U), + IMC_UNCORE_DEV(CFL_4U), + IMC_UNCORE_DEV(CFL_4H), + IMC_UNCORE_DEV(CFL_6H), + IMC_UNCORE_DEV(CFL_2S_D), + IMC_UNCORE_DEV(CFL_4S_D), + IMC_UNCORE_DEV(CFL_6S_D), + IMC_UNCORE_DEV(CFL_8S_D), + IMC_UNCORE_DEV(CFL_4S_W), + IMC_UNCORE_DEV(CFL_6S_W), + IMC_UNCORE_DEV(CFL_8S_W), + IMC_UNCORE_DEV(CFL_4S_S), + IMC_UNCORE_DEV(CFL_6S_S), + IMC_UNCORE_DEV(CFL_8S_S), + IMC_UNCORE_DEV(AML_YD), + IMC_UNCORE_DEV(AML_YQ), + IMC_UNCORE_DEV(WHL_UQ), + IMC_UNCORE_DEV(WHL_4_UQ), + IMC_UNCORE_DEV(WHL_UD), + IMC_UNCORE_DEV(CML_H1), + IMC_UNCORE_DEV(CML_H2), + IMC_UNCORE_DEV(CML_H3), + IMC_UNCORE_DEV(CML_U1), + IMC_UNCORE_DEV(CML_U2), + IMC_UNCORE_DEV(CML_U3), + IMC_UNCORE_DEV(CML_S1), + IMC_UNCORE_DEV(CML_S2), + IMC_UNCORE_DEV(CML_S3), + IMC_UNCORE_DEV(CML_S4), + IMC_UNCORE_DEV(CML_S5), { /* end: all zeroes */ }, }; static const struct pci_device_id icl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(ICL_U), + IMC_UNCORE_DEV(ICL_U2), + IMC_UNCORE_DEV(RKL_1), + IMC_UNCORE_DEV(RKL_2), { /* end: all zeroes */ }, }; @@ -1326,106 +1197,57 @@ void nhm_uncore_cpu_init(void) /* Tiger Lake MMIO uncore support */ static const struct pci_device_id tgl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_5_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_6_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_7_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_8_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_9_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_10_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_11_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_12_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_13_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_14_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_15_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_16_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(TGL_U1), + IMC_UNCORE_DEV(TGL_U2), + IMC_UNCORE_DEV(TGL_U3), + IMC_UNCORE_DEV(TGL_U4), + IMC_UNCORE_DEV(TGL_H), + IMC_UNCORE_DEV(ADL_1), + IMC_UNCORE_DEV(ADL_2), + IMC_UNCORE_DEV(ADL_3), + IMC_UNCORE_DEV(ADL_4), + IMC_UNCORE_DEV(ADL_5), + IMC_UNCORE_DEV(ADL_6), + IMC_UNCORE_DEV(ADL_7), + IMC_UNCORE_DEV(ADL_8), + IMC_UNCORE_DEV(ADL_9), + IMC_UNCORE_DEV(ADL_10), + IMC_UNCORE_DEV(ADL_11), + IMC_UNCORE_DEV(ADL_12), + IMC_UNCORE_DEV(ADL_13), + IMC_UNCORE_DEV(ADL_14), + IMC_UNCORE_DEV(ADL_15), + IMC_UNCORE_DEV(ADL_16), + IMC_UNCORE_DEV(ADL_17), + IMC_UNCORE_DEV(ADL_18), + IMC_UNCORE_DEV(ADL_19), + IMC_UNCORE_DEV(ADL_20), + IMC_UNCORE_DEV(ADL_21), + IMC_UNCORE_DEV(RPL_1), + IMC_UNCORE_DEV(RPL_2), + IMC_UNCORE_DEV(RPL_3), + IMC_UNCORE_DEV(RPL_4), + IMC_UNCORE_DEV(RPL_5), + IMC_UNCORE_DEV(RPL_6), + IMC_UNCORE_DEV(RPL_7), + IMC_UNCORE_DEV(RPL_8), + IMC_UNCORE_DEV(RPL_9), + IMC_UNCORE_DEV(RPL_10), + IMC_UNCORE_DEV(RPL_11), + IMC_UNCORE_DEV(RPL_12), + IMC_UNCORE_DEV(RPL_13), + IMC_UNCORE_DEV(RPL_14), + IMC_UNCORE_DEV(RPL_15), + IMC_UNCORE_DEV(RPL_16), + IMC_UNCORE_DEV(RPL_17), + IMC_UNCORE_DEV(RPL_18), + IMC_UNCORE_DEV(RPL_19), + IMC_UNCORE_DEV(RPL_20), + IMC_UNCORE_DEV(RPL_21), + IMC_UNCORE_DEV(RPL_22), + IMC_UNCORE_DEV(RPL_23), + IMC_UNCORE_DEV(RPL_24), + IMC_UNCORE_DEV(RPL_25), { /* end: all zeroes */ } }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 6d759f88315c..ac542f98c070 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -103,7 +103,9 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ROCKETLAKE: case INTEL_FAM6_ALDERLAKE: case INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 150261d929b9..21a5482bcf84 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -67,22 +67,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ - -#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ + +#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ +#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -325,6 +326,8 @@ struct cpu_hw_events { * AMD specific bits */ struct amd_nb *amd_nb; + int brs_active; /* BRS is enabled */ + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ @@ -1105,6 +1108,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool has_amd_brs(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_AMD_BRS; +} + static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; @@ -1211,6 +1219,75 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); +#ifdef CONFIG_PERF_EVENTS_AMD_BRS +int amd_brs_init(void); +void amd_brs_disable(void); +void amd_brs_enable(void); +void amd_brs_enable_all(void); +void amd_brs_disable_all(void); +void amd_brs_drain(void); +void amd_brs_lopwr_init(void); +void amd_brs_disable_all(void); +int amd_brs_setup_filter(struct perf_event *event); +void amd_brs_reset(void); + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + perf_sched_cb_inc(event->ctx->pmu); + cpuc->lbr_users++; + /* + * No need to reset BRS because it is reset + * on brs_enable() and it is saturating + */ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); +#else +static inline int amd_brs_init(void) +{ + return 0; +} +static inline void amd_brs_disable(void) {} +static inline void amd_brs_enable(void) {} +static inline void amd_brs_drain(void) {} +static inline void amd_brs_lopwr_init(void) {} +static inline void amd_brs_disable_all(void) {} +static inline int amd_brs_setup_filter(struct perf_event *event) +{ + return 0; +} +static inline void amd_brs_reset(void) {} + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +#endif + #else /* CONFIG_CPU_SUP_AMD */ static inline int amd_pmu_init(void) @@ -1218,6 +1295,22 @@ static inline int amd_pmu_init(void) return 0; } +static inline int amd_brs_init(void) +{ + return -EOPNOTSUPP; +} + +static inline void amd_brs_drain(void) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +static inline void amd_brs_disable_all(void) +{ +} #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index 8e4d0391ff6c..e481056698de 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -5,7 +5,5 @@ obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o -obj-$(CONFIG_IA32_AOUT) += ia32_aout.o - audit-class-$(CONFIG_AUDIT) := audit.o obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c deleted file mode 100644 index 9bd15241fadb..000000000000 --- a/arch/x86/ia32/ia32_aout.c +++ /dev/null @@ -1,325 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * a.out loader for x86-64 - * - * Copyright (C) 1991, 1992, 1996 Linus Torvalds - * Hacked together by Andi Kleen - */ - -#include <linux/module.h> - -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/a.out.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/binfmts.h> -#include <linux/personality.h> -#include <linux/init.h> -#include <linux/jiffies.h> -#include <linux/perf_event.h> -#include <linux/sched/task_stack.h> - -#include <linux/uaccess.h> -#include <asm/cacheflush.h> -#include <asm/user32.h> -#include <asm/ia32.h> - -#undef WARN_OLD - -static int load_aout_binary(struct linux_binprm *); -static int load_aout_library(struct file *); - -static struct linux_binfmt aout_format = { - .module = THIS_MODULE, - .load_binary = load_aout_binary, - .load_shlib = load_aout_library, -}; - -static int set_brk(unsigned long start, unsigned long end) -{ - start = PAGE_ALIGN(start); - end = PAGE_ALIGN(end); - if (end <= start) - return 0; - return vm_brk(start, end - start); -} - - -/* - * create_aout_tables() parses the env- and arg-strings in new user - * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. - */ -static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) -{ - u32 __user *argv, *envp, *sp; - int argc = bprm->argc, envc = bprm->envc; - - sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); - sp -= envc+1; - envp = sp; - sp -= argc+1; - argv = sp; - put_user((unsigned long) envp, --sp); - put_user((unsigned long) argv, --sp); - put_user(argc, --sp); - current->mm->arg_start = (unsigned long) p; - while (argc-- > 0) { - char c; - - put_user((u32)(unsigned long)p, argv++); - do { - get_user(c, p++); - } while (c); - } - put_user(0, argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-- > 0) { - char c; - - put_user((u32)(unsigned long)p, envp++); - do { - get_user(c, p++); - } while (c); - } - put_user(0, envp); - current->mm->env_end = (unsigned long) p; - return sp; -} - -/* - * These are the functions used to load a.out style executables and shared - * libraries. There is no binary dependent code anywhere else. - */ -static int load_aout_binary(struct linux_binprm *bprm) -{ - unsigned long error, fd_offset, rlim; - struct pt_regs *regs = current_pt_regs(); - struct exec ex; - int retval; - - ex = *((struct exec *) bprm->buf); /* exec-header */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && - N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || - N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(file_inode(bprm->file)) < - ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - return -ENOEXEC; - } - - fd_offset = N_TXTOFF(ex); - - /* Check initial limits. This avoids letting people circumvent - * size limits imposed on them by creating programs with large - * arrays in the data or bss. - */ - rlim = rlimit(RLIMIT_DATA); - if (rlim >= RLIM_INFINITY) - rlim = ~0; - if (ex.a_data + ex.a_bss > rlim) - return -ENOMEM; - - /* Flush all traces of the currently running executable */ - retval = begin_new_exec(bprm); - if (retval) - return retval; - - /* OK, This is the point of no return */ - set_personality(PER_LINUX); - set_personality_ia32(false); - - setup_new_exec(bprm); - - regs->cs = __USER32_CS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - - current->mm->end_code = ex.a_text + - (current->mm->start_code = N_TXTADDR(ex)); - current->mm->end_data = ex.a_data + - (current->mm->start_data = N_DATADDR(ex)); - current->mm->brk = ex.a_bss + - (current->mm->start_brk = N_BSSADDR(ex)); - - retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) - return retval; - - if (N_MAGIC(ex) == OMAGIC) { - unsigned long text_addr, map_size; - - text_addr = N_TXTADDR(ex); - map_size = ex.a_text+ex.a_data; - - error = vm_brk(text_addr & PAGE_MASK, map_size); - - if (error) - return error; - - error = read_code(bprm->file, text_addr, 32, - ex.a_text + ex.a_data); - if ((signed long)error < 0) - return error; - } else { -#ifdef WARN_OLD - static unsigned long error_time, error_time2; - if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && - time_after(jiffies, error_time2 + 5*HZ)) { - printk(KERN_NOTICE "executable not page aligned\n"); - error_time2 = jiffies; - } - - if ((fd_offset & ~PAGE_MASK) != 0 && - time_after(jiffies, error_time + 5*HZ)) { - printk(KERN_WARNING - "fd_offset is not page aligned. Please convert " - "program: %pD\n", - bprm->file); - error_time = jiffies; - } -#endif - - if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { - error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (error) - return error; - - read_code(bprm->file, N_TXTADDR(ex), fd_offset, - ex.a_text+ex.a_data); - goto beyond_if; - } - - error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_32BIT, - fd_offset); - - if (error != N_TXTADDR(ex)) - return error; - - error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_32BIT, - fd_offset + ex.a_text); - if (error != N_DATADDR(ex)) - return error; - } - -beyond_if: - error = set_brk(current->mm->start_brk, current->mm->brk); - if (error) - return error; - - set_binfmt(&aout_format); - - current->mm->start_stack = - (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); - /* start thread */ - loadsegment(fs, 0); - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); - load_gs_index(0); - (regs)->ip = ex.a_entry; - (regs)->sp = current->mm->start_stack; - (regs)->flags = 0x200; - (regs)->cs = __USER32_CS; - (regs)->ss = __USER32_DS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = - regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; - return 0; -} - -static int load_aout_library(struct file *file) -{ - unsigned long bss, start_addr, len, error; - int retval; - struct exec ex; - loff_t pos = 0; - - retval = -ENOEXEC; - error = kernel_read(file, &ex, sizeof(ex), &pos); - if (error != sizeof(ex)) - goto out; - - /* We come in here for the regular a.out style of shared libraries */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || - N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(file_inode(file)) < - ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - goto out; - } - - if (N_FLAGS(ex)) - goto out; - - /* For QMAGIC, the starting address is 0x20 into the page. We mask - this off to get the starting address for the page */ - - start_addr = ex.a_entry & 0xfffff000; - - if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { -#ifdef WARN_OLD - static unsigned long error_time; - if (time_after(jiffies, error_time + 5*HZ)) { - printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert " - "library: %pD\n", - file); - error_time = jiffies; - } -#endif - retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (retval) - goto out; - - read_code(file, start_addr, N_TXTOFF(ex), - ex.a_text + ex.a_data); - retval = 0; - goto out; - } - /* Now use mmap to map the library into memory. */ - error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_32BIT, - N_TXTOFF(ex)); - retval = error; - if (error != start_addr) - goto out; - - len = PAGE_ALIGN(ex.a_text + ex.a_data); - bss = ex.a_text + ex.a_data + ex.a_bss; - if (bss > len) { - retval = vm_brk(start_addr + len, bss - len); - if (retval) - goto out; - } - retval = 0; -out: - return retval; -} - -static int __init init_aout_binfmt(void) -{ - register_binfmt(&aout_format); - return 0; -} - -static void __exit exit_aout_binfmt(void) -{ - unregister_binfmt(&aout_format); -} - -module_init(init_aout_binfmt); -module_exit(exit_aout_binfmt); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h index 9aff97f0de7f..d937c55e717e 100644 --- a/arch/x86/include/asm/acenv.h +++ b/arch/x86/include/asm/acenv.h @@ -13,7 +13,19 @@ /* Asm macros */ -#define ACPI_FLUSH_CPU_CACHE() wbinvd() +/* + * ACPI_FLUSH_CPU_CACHE() flushes caches on entering sleep states. + * It is required to prevent data loss. + * + * While running inside virtual machine, the kernel can bypass cache flushing. + * Changing sleep state in a virtual machine doesn't affect the host system + * sleep state and cannot lead to data loss. + */ +#define ACPI_FLUSH_CPU_CACHE() \ +do { \ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) \ + wbinvd(); \ +} while (0) int __acpi_acquire_global_lock(unsigned int *lock); int __acpi_release_global_lock(unsigned int *lock); diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index 46e1df45efc0..aabdbb5ab920 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -49,7 +49,7 @@ union ibs_op_ctl { }; }; -/* MSR 0xc0011035: IBS Op Data 2 */ +/* MSR 0xc0011035: IBS Op Data 1 */ union ibs_op_data { __u64 val; struct { diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 00d1a400b7a1..ed0eaf65c437 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -16,7 +16,6 @@ extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; extern bool early_is_amd_nb(u32 value); extern struct resource *amd_get_mmconfig_range(struct resource *res); -extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); extern int amd_numa_init(void); extern int amd_get_subcaches(int); diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 48067af94678..bd8ae0a7010a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -328,6 +328,8 @@ struct apic { /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + /* wakeup secondary CPU using 64-bit wakeup point */ + int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); void (*inquire_remote_apic)(int apicid); @@ -488,6 +490,11 @@ static inline unsigned int read_apic_id(void) return apic->get_apic_id(reg); } +#ifdef CONFIG_X86_64 +typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip); +extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler); +#endif + extern int default_apic_id_valid(u32 apicid); extern int default_acpi_madt_oem_check(char *, char *); extern void default_setup_apic_routing(void); diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 5716f22f81ac..92035eb3afee 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -95,12 +95,6 @@ #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 #define APIC_LVT0 0x350 -#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18) -#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3) -#define SET_APIC_TIMER_BASE(x) (((x) << 18)) -#define APIC_TIMER_BASE_CLKIN 0x0 -#define APIC_TIMER_BASE_TMBASE 0x1 -#define APIC_TIMER_BASE_DIV 0x2 #define APIC_LVT_TIMER_ONESHOT (0 << 17) #define APIC_LVT_TIMER_PERIODIC (1 << 17) #define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 981fe923a59f..53e9b0620d96 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -74,6 +74,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) BOOT_PARAM_PRESERVE(hdr), BOOT_PARAM_PRESERVE(e820_table), BOOT_PARAM_PRESERVE(eddbuf), + BOOT_PARAM_PRESERVE(cc_blob_address), }; memset(&scratch, 0, sizeof(scratch)); diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index aaf0cb0db4ae..a3ec87d198ac 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -18,7 +18,7 @@ #ifdef CONFIG_X86_32 # define __BUG_REL(val) ".long " __stringify(val) #else -# define __BUG_REL(val) ".long " __stringify(val) " - 2b" +# define __BUG_REL(val) ".long " __stringify(val) " - ." #endif #ifdef CONFIG_DEBUG_BUGVERBOSE diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 0a7fe0321613..215f5a65790f 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) #define arch_cmpxchg64_local(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ (unsigned long long)(n))) +#define arch_try_cmpxchg64(ptr, po, n) \ + __try_cmpxchg64((ptr), (unsigned long long *)(po), \ + (unsigned long long)(n)) #endif static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) @@ -70,6 +73,24 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) return prev; } +static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new) +{ + bool success; + u64 old = *pold; + asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]" + CC_SET(z) + : CC_OUT(z) (success), + [ptr] "+m" (*ptr), + "+A" (old) + : "b" ((u32)new), + "c" ((u32)(new >> 32)) + : "memory"); + + if (unlikely(!success)) + *pold = old; + return success; +} + #ifndef CONFIG_X86_CMPXCHG64 /* * Building a kernel capable running on 80386 and 80486. It may be necessary diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 072e5459fe2f..250187ac8248 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -19,6 +19,12 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) arch_cmpxchg_local((ptr), (o), (n)); \ }) +#define arch_try_cmpxchg64(ptr, po, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + arch_try_cmpxchg((ptr), (po), (n)); \ +}) + #define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 86e5e4e26fcb..8cbf623f0ecf 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -36,6 +36,8 @@ extern int _debug_hotplug_cpu(int cpu, int action); #endif #endif +extern void ap_init_aperfmperf(void); + int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); @@ -43,14 +45,12 @@ unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); #ifdef CONFIG_CPU_SUP_INTEL extern void __init sld_setup(struct cpuinfo_x86 *c); -extern void switch_to_sld(unsigned long tifn); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); extern bool handle_guest_split_lock(unsigned long ip); extern void handle_bus_lock(struct pt_regs *regs); u8 get_this_hybrid_cpu_type(void); #else static inline void __init sld_setup(struct cpuinfo_x86 *c) {} -static inline void switch_to_sld(unsigned long tifn) {} static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) { return false; @@ -76,4 +76,22 @@ static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {} extern __noendbr void cet_disable(void); +struct ucode_cpu_info; + +int intel_cpu_collect_info(struct ucode_cpu_info *uci); + +static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) +{ + if (s1 != s2) + return false; + + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; + + /* ... or they intersect. */ + return p1 & p2; +} + #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index dd5ea1bdf04c..75efc4c6f076 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -143,7 +143,7 @@ extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); extern struct cpu_entry_area *get_cpu_entry_area(int cpu); -static inline struct entry_stack *cpu_entry_stack(int cpu) +static __always_inline struct entry_stack *cpu_entry_stack(int cpu) { return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1261842d006c..66d3e3b1d24d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -34,14 +34,17 @@ enum cpuid_leafs CPUID_8000_001F_EAX, }; +#define X86_CAP_FMT_NUM "%d:%d" +#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) + #ifdef CONFIG_X86_FEATURE_NAMES extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; #define X86_CAP_FMT "%s" #define x86_cap_flag(flag) x86_cap_flags[flag] #else -#define X86_CAP_FMT "%d:%d" -#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31) +#define X86_CAP_FMT X86_CAP_FMT_NUM +#define x86_cap_flag x86_cap_flag_num #endif /* diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..21bb78dfd41d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -201,7 +201,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -/* FREE! ( 7*32+10) */ +#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -/* FREE! ( 7*32+20) */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ @@ -238,6 +238,7 @@ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ #define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ +#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -315,6 +316,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h new file mode 100644 index 000000000000..70b2db18165e --- /dev/null +++ b/arch/x86/include/asm/cpuid.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * CPUID-related helpers/definitions + * + * Derived from arch/x86/kvm/cpuid.c + */ + +#ifndef _ASM_X86_CPUID_H +#define _ASM_X86_CPUID_H + +static __always_inline bool cpuid_function_is_indexed(u32 function) +{ + switch (function) { + case 4: + case 7: + case 0xb: + case 0xd: + case 0xf: + case 0x10: + case 0x12: + case 0x14: + case 0x17: + case 0x18: + case 0x1d: + case 0x1e: + case 0x1f: + case 0x8000001d: + return true; + } + + return false; +} + +#endif /* _ASM_X86_CPUID_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 1231d63f836d..36369e76cc63 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -10,12 +10,6 @@ * cpu_feature_enabled(). */ -#ifdef CONFIG_X86_SMAP -# define DISABLE_SMAP 0 -#else -# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31)) -#endif - #ifdef CONFIG_X86_UMIP # define DISABLE_UMIP 0 #else @@ -68,6 +62,12 @@ # define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) #endif +#ifdef CONFIG_INTEL_TDX_GUEST +# define DISABLE_TDX_GUEST 0 +#else +# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -79,8 +79,8 @@ #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) +#define DISABLED_MASK8 (DISABLE_TDX_GUEST) +#define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 98938a68251c..bed74a0f2932 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -357,6 +357,11 @@ static inline u32 efi64_convert_status(efi_status_t status) runtime), \ func, __VA_ARGS__)) +#define efi_dxe_call(func, ...) \ + (efi_is_native() \ + ? efi_dxe_table->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__)) + #else /* CONFIG_EFI_MIXED */ static inline bool efi_is_64bit(void) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 29fea180a665..cb0ff1055ab1 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -116,7 +116,7 @@ extern unsigned int vdso32_enabled; * now struct_user_regs, they are different) */ -#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \ +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ pr_reg[0] = regs->bx; \ pr_reg[1] = regs->cx; \ @@ -128,6 +128,7 @@ do { \ pr_reg[7] = regs->ds; \ pr_reg[8] = regs->es; \ pr_reg[9] = regs->fs; \ + savesegment(gs, pr_reg[10]); \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ pr_reg[13] = regs->cs; \ @@ -136,18 +137,6 @@ do { \ pr_reg[16] = regs->ss; \ } while (0); -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ -do { \ - ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ - pr_reg[10] = get_user_gs(regs); \ -} while (0); - -#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \ -do { \ - ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ - savesegment(gs, pr_reg[10]); \ -} while (0); - #define ELF_PLATFORM (utsname()->machine) #define set_personality_64bit() do { } while (0) diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 43184640b579..674ed46d3ced 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -10,7 +10,7 @@ #include <asm/fpu/api.h> /* Check that the stack and regs on entry from user mode are sane. */ -static __always_inline void arch_check_user_regs(struct pt_regs *regs) +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { /* @@ -42,7 +42,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) WARN_ON_ONCE(regs != task_pt_regs(current)); } } -#define arch_check_user_regs arch_check_user_regs +#define arch_enter_from_user_mode arch_enter_from_user_mode static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index c83b3020350a..6b0f31fb53f7 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -162,7 +162,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) } /* prctl */ -struct task_struct; -extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); +extern long fpu_xstate_prctl(int option, unsigned long arg2); #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/arch/x86/include/asm/fpu/internal.h +++ /dev/null diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 032e020853aa..731ee7cc40a5 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -26,6 +26,7 @@ #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/fixmap.h> +#include <asm/pgtable_areas.h> /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 7924f27f5c8b..72184b0b2219 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -632,6 +632,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap); #endif +#ifdef CONFIG_INTEL_TDX_GUEST +DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception); +#endif + /* Device interrupts common/spurious */ DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e9736af126b2..1870b99c3356 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -44,6 +44,7 @@ #include <asm/page.h> #include <asm/early_ioremap.h> #include <asm/pgtable_types.h> +#include <asm/shared/io.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -256,37 +257,23 @@ static inline void slow_down_io(void) #endif #define BUILDIO(bwl, bw, type) \ -static inline void out##bwl(unsigned type value, int port) \ -{ \ - asm volatile("out" #bwl " %" #bw "0, %w1" \ - : : "a"(value), "Nd"(port)); \ -} \ - \ -static inline unsigned type in##bwl(int port) \ -{ \ - unsigned type value; \ - asm volatile("in" #bwl " %w1, %" #bw "0" \ - : "=a"(value) : "Nd"(port)); \ - return value; \ -} \ - \ -static inline void out##bwl##_p(unsigned type value, int port) \ +static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ slow_down_io(); \ } \ \ -static inline unsigned type in##bwl##_p(int port) \ +static inline type in##bwl##_p(u16 port) \ { \ - unsigned type value = in##bwl(port); \ + type value = in##bwl(port); \ slow_down_io(); \ return value; \ } \ \ -static inline void outs##bwl(int port, const void *addr, unsigned long count) \ +static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ - unsigned type *value = (unsigned type *)addr; \ + type *value = (type *)addr; \ while (count) { \ out##bwl(*value, port); \ value++; \ @@ -299,10 +286,10 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ } \ } \ \ -static inline void ins##bwl(int port, void *addr, unsigned long count) \ +static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ - unsigned type *value = (unsigned type *)addr; \ + type *value = (type *)addr; \ while (count) { \ *value = in##bwl(port); \ value++; \ @@ -315,13 +302,11 @@ static inline void ins##bwl(int port, void *addr, unsigned long count) \ } \ } -BUILDIO(b, b, char) -BUILDIO(w, w, short) -BUILDIO(l, , int) +BUILDIO(b, b, u8) +BUILDIO(w, w, u16) +BUILDIO(l, , u32) +#undef BUILDIO -#define inb inb -#define inw inw -#define inl inl #define inb_p inb_p #define inw_p inw_p #define inl_p inl_p @@ -329,9 +314,6 @@ BUILDIO(l, , int) #define insw insw #define insl insl -#define outb outb -#define outw outw -#define outl outl #define outb_p outb_p #define outw_p outw_p #define outl_p outl_p diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 111104d1c2cd..7793e52d6237 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -137,14 +137,6 @@ static __always_inline void arch_local_irq_restore(unsigned long flags) if (!arch_irqs_disabled_flags(flags)) arch_local_irq_enable(); } -#else -#ifdef CONFIG_X86_64 -#ifdef CONFIG_XEN_PV -#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV -#else -#define SWAPGS swapgs -#endif -#endif #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 0449b125d27f..071572e23d3a 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -20,7 +20,7 @@ _ASM_PTR "%c0 + %c1 - .\n\t" \ ".popsection \n\t" -#ifdef CONFIG_STACK_VALIDATION +#ifdef CONFIG_HAVE_JUMP_LABEL_HACK static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { @@ -34,7 +34,7 @@ l_yes: return true; } -#else +#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { @@ -48,7 +48,7 @@ l_yes: return true; } -#endif /* STACK_VALIDATION */ +#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */ static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 56935ebb1dfe..57bc74e112f2 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,6 +7,8 @@ #include <linux/interrupt.h> #include <uapi/asm/kvm_para.h> +#include <asm/tdx.h> + #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); #else @@ -32,6 +34,10 @@ static inline bool kvm_check_and_clear_guest_paused(void) static inline long kvm_hypercall0(unsigned int nr) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, 0, 0, 0, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr) @@ -42,6 +48,10 @@ static inline long kvm_hypercall0(unsigned int nr) static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, 0, 0, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1) @@ -53,6 +63,10 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, p2, 0, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1), "c"(p2) @@ -64,6 +78,10 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, p2, p3, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1), "c"(p2), "d"(p3) @@ -76,6 +94,10 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, unsigned long p4) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, p2, p3, p4); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index e2c6f433ed10..88ceaf3648b3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -49,9 +49,6 @@ void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, void __init mem_encrypt_free_decrypted_mem(void); -/* Architecture __weak replacement functions */ -void __init mem_encrypt_init(void); - void __init sev_es_init_vc_handling(void); #define __bss_decrypted __section(".bss..decrypted") @@ -89,6 +86,9 @@ static inline void mem_encrypt_free_decrypted_mem(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + /* * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when * writing to or comparing values from the cr3 register. Having the diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 27516046117a..b8d40ddeab00 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -141,7 +141,7 @@ do { \ #ifdef CONFIG_X86_32 #define deactivate_mm(tsk, mm) \ do { \ - lazy_load_gs(0); \ + loadsegment(gs, 0); \ } while (0) #else #define deactivate_mm(tsk, mm) \ diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/arch/x86/include/asm/mmx.h +++ /dev/null diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ee15311b6be1..403e83b4adc8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -76,6 +76,8 @@ /* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */ #define MSR_IA32_CORE_CAPS 0x000000cf +#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT 2 +#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS BIT(MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT) #define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5 #define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT) @@ -154,6 +156,11 @@ #define MSR_IA32_POWER_CTL 0x000001fc #define MSR_IA32_POWER_CTL_BIT_EE 19 +/* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */ +#define MSR_INTEGRITY_CAPS 0x000002d9 +#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4 +#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT) + #define MSR_LBR_NHM_FROM 0x00000680 #define MSR_LBR_NHM_TO 0x000006c0 #define MSR_LBR_CORE_FROM 0x00000040 @@ -312,6 +319,7 @@ /* Run Time Average Power Limiting (RAPL) Interface */ +#define MSR_VR_CURRENT_CONFIG 0x00000601 #define MSR_RAPL_POWER_UNIT 0x00000606 #define MSR_PKG_POWER_LIMIT 0x00000610 @@ -502,8 +510,10 @@ #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 #define MSR_AMD64_SEV_ES_ENABLED_BIT 1 +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f @@ -524,6 +534,11 @@ #define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16) #define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24) +/* AMD Performance Counter Global Status and Control MSRs */ +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 +#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -688,6 +703,10 @@ #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff +/* AMD Branch Sampling configuration */ +#define MSR_AMD_DBG_EXTN_CFG 0xc000010f +#define MSR_AMD_SAMP_BR_FROM 0xc0010300 + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index d42e6c6b47b1..65ec1965cd28 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -10,16 +10,7 @@ #include <asm/errno.h> #include <asm/cpumask.h> #include <uapi/asm/msr.h> - -struct msr { - union { - struct { - u32 l; - u32 h; - }; - u64 q; - }; -}; +#include <asm/shared/msr.h> struct msr_info { u32 msr_no; diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 1cb9c17a4cb4..5c5f1e56c404 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -47,6 +47,7 @@ struct nmiaction { #define register_nmi_handler(t, fn, fg, n, init...) \ ({ \ static struct nmiaction init fn##_na = { \ + .list = LIST_HEAD_INIT(fn##_na.list), \ .handler = (fn), \ .name = (n), \ .flags = (fg), \ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index e9c86299b835..baa70451b8df 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -16,7 +16,7 @@ extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; -static inline unsigned long __phys_addr_nodebug(unsigned long x) +static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { unsigned long y = x - __START_KERNEL_map; diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index a0627dfae541..1307cd689d2a 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,15 @@ struct irq_routing_table { struct irq_info slots[]; } __attribute__((packed)); +struct irt_routing_table { + u32 signature; /* IRT_SIGNATURE should be here */ + u8 size; /* Number of entries provided */ + u8 used; /* Number of entries actually used */ + u16 exclusive_irqs; /* IRQs devoted exclusively to + PCI usage */ + struct irq_info slots[]; +} __attribute__((packed)); + extern unsigned int pcibios_irq_mask; extern raw_spinlock_t pci_config_lock; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index b06e4c573add..409725e86f42 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_PERF_EVENT_H #define _ASM_X86_PERF_EVENT_H +#include <linux/static_call.h> + /* * Performance event hw details: */ @@ -184,6 +186,18 @@ union cpuid28_ecx { unsigned int full; }; +/* + * AMD "Extended Performance Monitoring and Debug" CPUID + * detection/enumeration details: + */ +union cpuid_0x80000022_ebx { + struct { + /* Number of Core Performance Counters */ + unsigned int num_core_pmc:4; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -371,6 +385,11 @@ struct pebs_xmm { }; /* + * AMD Extended Performance Monitoring and Debug cpuid feature detection + */ +#define EXT_PERFMON_DEBUG_FEATURES 0x80000022 + +/* * IBS cpuid feature detection */ @@ -391,6 +410,7 @@ struct pebs_xmm { #define IBS_CAPS_OPBRNFUSE (1U<<8) #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) +#define IBS_CAPS_ZEN4 (1U<<11) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -404,6 +424,7 @@ struct pebs_xmm { #define IBSCTL_LVT_OFFSET_MASK 0x0F /* IBS fetch bits/masks */ +#define IBS_FETCH_L3MISSONLY (1ULL<<59) #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) @@ -420,6 +441,7 @@ struct pebs_xmm { #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_L3MISSONLY (1ULL<<16) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ @@ -518,6 +540,27 @@ static inline void intel_pt_handle_vmx(int on) #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) extern void amd_pmu_enable_virt(void); extern void amd_pmu_disable_virt(void); + +#if defined(CONFIG_PERF_EVENTS_AMD_BRS) + +#define PERF_NEEDS_LOPWR_CB 1 + +/* + * architectural low power callback impacts + * drivers/acpi/processor_idle.c + * drivers/acpi/acpi_pad.c + */ +extern void perf_amd_brs_lopwr_cb(bool lopwr_in); + +DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); + +static inline void perf_lopwr_cb(bool lopwr_in) +{ + static_call_mod(perf_lopwr_cb)(lopwr_in); +} + +#endif /* PERF_NEEDS_LOPWR_CB */ + #else static inline void amd_pmu_enable_virt(void) { } static inline void amd_pmu_disable_virt(void) { } diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 1d5f14aff5f6..2e6c04d8a45b 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -41,9 +41,6 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, return __arch_override_mprotect_pkey(vma, prot, pkey); } -extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); - #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) @@ -118,11 +115,6 @@ int mm_pkey_free(struct mm_struct *mm, int pkey) return 0; } -extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); -extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); - static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index feed36d44d04..12ef86b19910 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -13,6 +13,8 @@ void syscall_init(void); #ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); void entry_SYSCALL_64_safe_stack(void); +void entry_SYSRETQ_unsafe_stack(void); +void entry_SYSRETQ_end(void); long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif @@ -28,6 +30,8 @@ void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_SYSCALL_compat_safe_stack(void); +void entry_SYSRETL_compat_unsafe_stack(void); +void entry_SYSRETL_compat_end(void); void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); @@ -35,11 +39,9 @@ void xen_entry_INT80_compat(void); #endif void x86_configure_nx(void); -void x86_report_nx(void); extern int reboot_force; -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2); +long do_arch_prctl_common(int option, unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 4357e0f2cd5f..f4db78b09c8f 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -186,9 +186,13 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack && + regs->ip < (unsigned long)entry_SYSRETQ_end); #ifdef CONFIG_IA32_EMULATION ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); + ret = ret || (regs->ip >= (unsigned long)entry_SYSRETL_compat_unsafe_stack && + regs->ip < (unsigned long)entry_SYSRETL_compat_end); #endif return ret; diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 331474b150f1..fd6f6e5b755a 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -25,6 +25,7 @@ struct real_mode_header { u32 sev_es_trampoline_start; #endif #ifdef CONFIG_X86_64 + u32 trampoline_start64; u32 trampoline_pgd; #endif /* ACPI S3 wakeup */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 656ed6531d03..2e7890dd58a4 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -350,18 +350,6 @@ static inline void __loadsegment_fs(unsigned short value) #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") -/* - * x86-32 user GS accessors. This is ugly and could do with some cleaning up. - */ -#ifdef CONFIG_X86_32 -# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) -# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -# define task_user_gs(tsk) ((tsk)->thread.gs) -# define lazy_save_gs(v) savesegment(gs, (v)) -# define lazy_load_gs(v) loadsegment(gs, (v)) -# define load_gs_index(v) loadsegment(gs, (v)) -#endif /* X86_32 */ - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 896e48d45828..7590ac2570b9 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -50,7 +50,6 @@ extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); -extern unsigned long __startup_secondary_64(void); extern void startup_64_setup_env(unsigned long physbase); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); @@ -109,27 +108,19 @@ extern unsigned long _brk_end; void *extend_brk(size_t size, size_t align); /* - * Reserve space in the brk section. The name must be unique within - * the file, and somewhat descriptive. The size is in bytes. Must be - * used at file scope. + * Reserve space in the brk section. The name must be unique within the file, + * and somewhat descriptive. The size is in bytes. * - * (This uses a temp function to wrap the asm so we can pass it the - * size parameter; otherwise we wouldn't be able to. We can't use a - * "section" attribute on a normal variable because it always ends up - * being @progbits, which ends up allocating space in the vmlinux - * executable.) + * The allocation is done using inline asm (rather than using a section + * attribute on a normal variable) in order to allow the use of @nobits, so + * that it doesn't take up any space in the vmlinux file. */ -#define RESERVE_BRK(name,sz) \ - static void __section(".discard.text") __noendbr __used notrace \ - __brk_reservation_fn_##name##__(void) { \ - asm volatile ( \ - ".pushsection .brk_reservation,\"aw\",@nobits;" \ - ".brk." #name ":" \ - " 1:.skip %c0;" \ - " .size .brk." #name ", . - 1b;" \ - " .popsection" \ - : : "i" (sz)); \ - } +#define RESERVE_BRK(name, size) \ + asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t" \ + ".brk." #name ":\n\t" \ + ".skip " __stringify(size) "\n\t" \ + ".size .brk." #name ", " __stringify(size) "\n\t" \ + ".popsection\n\t") extern void probe_roms(void); #ifdef __i386__ diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 1b2fd32b42fe..b8357d6ecd47 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -57,9 +57,79 @@ #define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 #define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +/* GHCB GPA Register */ +#define GHCB_MSR_REG_GPA_REQ 0x012 +#define GHCB_MSR_REG_GPA_REQ_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)((v) & GENMASK_ULL(51, 0)) << 12) | \ + /* GHCBData[11:0] */ \ + GHCB_MSR_REG_GPA_REQ) + +#define GHCB_MSR_REG_GPA_RESP 0x013 +#define GHCB_MSR_REG_GPA_RESP_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) + +/* + * SNP Page State Change Operation + * + * GHCBData[55:52] - Page operation: + * 0x0001 Page assignment, Private + * 0x0002 Page assignment, Shared + */ +enum psc_op { + SNP_PAGE_STATE_PRIVATE = 1, + SNP_PAGE_STATE_SHARED, +}; + +#define GHCB_MSR_PSC_REQ 0x014 +#define GHCB_MSR_PSC_REQ_GFN(gfn, op) \ + /* GHCBData[55:52] */ \ + (((u64)((op) & 0xf) << 52) | \ + /* GHCBData[51:12] */ \ + ((u64)((gfn) & GENMASK_ULL(39, 0)) << 12) | \ + /* GHCBData[11:0] */ \ + GHCB_MSR_PSC_REQ) + +#define GHCB_MSR_PSC_RESP 0x015 +#define GHCB_MSR_PSC_RESP_VAL(val) \ + /* GHCBData[63:32] */ \ + (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) + /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 +#define GHCB_MSR_HV_FT_RESP_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) + +#define GHCB_HV_FT_SNP BIT_ULL(0) +#define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1) + +/* SNP Page State Change NAE event */ +#define VMGEXIT_PSC_MAX_ENTRY 253 + +struct psc_hdr { + u16 cur_entry; + u16 end_entry; + u32 reserved; +} __packed; + +struct psc_entry { + u64 cur_page : 12, + gfn : 40, + operation : 4, + pagesize : 1, + reserved : 7; +} __packed; + +struct snp_psc_desc { + struct psc_hdr hdr; + struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; +} __packed; + +/* Guest message request error code */ +#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32) #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 @@ -73,8 +143,20 @@ /* GHCBData[23:16] */ \ ((((u64)reason_val) & 0xff) << 16)) +/* Error codes from reason set 0 */ +#define SEV_TERM_SET_GEN 0 #define GHCB_SEV_ES_GEN_REQ 0 #define GHCB_SEV_ES_PROT_UNSUPPORTED 1 +#define GHCB_SNP_UNSUPPORTED 2 + +/* Linux-specific reason codes (used with reason set 1) */ +#define SEV_TERM_SET_LINUX 1 +#define GHCB_TERM_REGISTER 0 /* GHCB GPA registration failure */ +#define GHCB_TERM_PSC 1 /* Page State Change failure */ +#define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */ +#define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */ +#define GHCB_TERM_CPUID 4 /* CPUID-validation failure */ +#define GHCB_TERM_CPUID_HV 5 /* CPUID failure during hypervisor fallback */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ec060c433589..19514524f0f8 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -11,9 +11,10 @@ #include <linux/types.h> #include <asm/insn.h> #include <asm/sev-common.h> +#include <asm/bootparam.h> -#define GHCB_PROTO_OUR 0x0001UL -#define GHCB_PROTOCOL_MAX 1ULL +#define GHCB_PROTOCOL_MIN 1ULL +#define GHCB_PROTOCOL_MAX 2ULL #define GHCB_DEFAULT_USAGE 0ULL #define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } @@ -42,6 +43,24 @@ struct es_em_ctxt { struct es_fault_info fi; }; +/* + * AMD SEV Confidential computing blob structure. The structure is + * defined in OVMF UEFI firmware header: + * https://github.com/tianocore/edk2/blob/master/OvmfPkg/Include/Guid/ConfidentialComputingSevSnpBlob.h + */ +#define CC_BLOB_SEV_HDR_MAGIC 0x45444d41 +struct cc_blob_sev_info { + u32 magic; + u16 version; + u16 reserved; + u64 secrets_phys; + u32 secrets_len; + u32 rsvd1; + u64 cpuid_phys; + u32 cpuid_len; + u32 rsvd2; +} __packed; + void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); static inline u64 lower_bits(u64 val, unsigned int bits) @@ -60,6 +79,61 @@ extern void vc_no_ghcb(void); extern void vc_boot_ghcb(void); extern bool handle_vc_boot_ghcb(struct pt_regs *regs); +/* Software defined (when rFlags.CF = 1) */ +#define PVALIDATE_FAIL_NOUPDATE 255 + +/* RMP page size */ +#define RMP_PG_SIZE_4K 0 + +#define RMPADJUST_VMSA_PAGE_BIT BIT(16) + +/* SNP Guest message request */ +struct snp_req_data { + unsigned long req_gpa; + unsigned long resp_gpa; + unsigned long data_gpa; + unsigned int data_npages; +}; + +struct sev_guest_platform_data { + u64 secrets_gpa; +}; + +/* + * The secrets page contains 96-bytes of reserved field that can be used by + * the guest OS. The guest OS uses the area to save the message sequence + * number for each VMPCK. + * + * See the GHCB spec section Secret page layout for the format for this area. + */ +struct secrets_os_area { + u32 msg_seqno_0; + u32 msg_seqno_1; + u32 msg_seqno_2; + u32 msg_seqno_3; + u64 ap_jump_table_pa; + u8 rsvd[40]; + u8 guest_usage[32]; +} __packed; + +#define VMPCK_KEY_LEN 32 + +/* See the SNP spec version 0.9 for secrets page format */ +struct snp_secrets_page_layout { + u32 version; + u32 imien : 1, + rsvd1 : 31; + u32 fms; + u32 rsvd2; + u8 gosvw[16]; + u8 vmpck0[VMPCK_KEY_LEN]; + u8 vmpck1[VMPCK_KEY_LEN]; + u8 vmpck2[VMPCK_KEY_LEN]; + u8 vmpck3[VMPCK_KEY_LEN]; + struct secrets_os_area os_area; + u8 rsvd3[3840]; +} __packed; + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); @@ -87,12 +161,71 @@ extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, struct es_em_ctxt *ctxt, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) +{ + int rc; + + /* "rmpadjust" mnemonic support in binutils 2.36 and newer */ + asm volatile(".byte 0xF3,0x0F,0x01,0xFE\n\t" + : "=a"(rc) + : "a"(vaddr), "c"(rmp_psize), "d"(attrs) + : "memory", "cc"); + + return rc; +} +static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) +{ + bool no_rmpupdate; + int rc; + + /* "pvalidate" mnemonic support in binutils 2.36 and newer */ + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFF\n\t" + CC_SET(c) + : CC_OUT(c) (no_rmpupdate), "=a"(rc) + : "a"(vaddr), "c"(rmp_psize), "d"(validate) + : "memory", "cc"); + + if (no_rmpupdate) + return PVALIDATE_FAIL_NOUPDATE; + + return rc; +} +void setup_ghcb(void); +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned int npages); +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned int npages); +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); +void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); +void snp_set_memory_private(unsigned long vaddr, unsigned int npages); +void snp_set_wakeup_secondary_cpu(void); +bool snp_init(struct boot_params *bp); +void snp_abort(void); +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } +static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } +static inline void setup_ghcb(void) { } +static inline void __init +early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +static inline void __init +early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } +static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } +static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } +static inline void snp_set_wakeup_secondary_cpu(void) { } +static inline bool snp_init(struct boot_params *bp) { return false; } +static inline void snp_abort(void) { } +static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, + unsigned long *fw_err) +{ + return -ENOTTY; +} #endif #endif diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h new file mode 100644 index 000000000000..c0ef921c0586 --- /dev/null +++ b/arch/x86/include/asm/shared/io.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_IO_H +#define _ASM_X86_SHARED_IO_H + +#include <linux/types.h> + +#define BUILDIO(bwl, bw, type) \ +static inline void __out##bwl(type value, u16 port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline type __in##bwl(u16 port) \ +{ \ + type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} + +BUILDIO(b, b, u8) +BUILDIO(w, w, u16) +BUILDIO(l, , u32) +#undef BUILDIO + +#define inb __inb +#define inw __inw +#define inl __inl +#define outb __outb +#define outw __outw +#define outl __outl + +#endif diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h new file mode 100644 index 000000000000..1e6ec10b3a15 --- /dev/null +++ b/arch/x86/include/asm/shared/msr.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_MSR_H +#define _ASM_X86_SHARED_MSR_H + +struct msr { + union { + struct { + u32 l; + u32 h; + }; + u64 q; + }; +}; + +#endif /* _ASM_X86_SHARED_MSR_H */ diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h new file mode 100644 index 000000000000..e53f26228fbb --- /dev/null +++ b/arch/x86/include/asm/shared/tdx.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_TDX_H +#define _ASM_X86_SHARED_TDX_H + +#include <linux/bits.h> +#include <linux/types.h> + +#define TDX_HYPERCALL_STANDARD 0 + +#define TDX_HCALL_HAS_OUTPUT BIT(0) +#define TDX_HCALL_ISSUE_STI BIT(1) + +#define TDX_CPUID_LEAF_ID 0x21 +#define TDX_IDENT "IntelTDX " + +#ifndef __ASSEMBLY__ + +/* + * Used in __tdx_hypercall() to pass down and get back registers' values of + * the TDCALL instruction when requesting services from the VMM. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_hypercall_args { + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + +/* Used to request services from the VMM */ +u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void); + +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index d17b39893b79..bab490379c65 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -19,25 +19,14 @@ #ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_SMAP - #define ASM_CLAC \ ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP #define ASM_STAC \ ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP -#else /* CONFIG_X86_SMAP */ - -#define ASM_CLAC -#define ASM_STAC - -#endif /* CONFIG_X86_SMAP */ - #else /* __ASSEMBLY__ */ -#ifdef CONFIG_X86_SMAP - static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ @@ -76,19 +65,6 @@ static __always_inline void smap_restore(unsigned long flags) #define ASM_STAC \ ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP) -#else /* CONFIG_X86_SMAP */ - -static inline void clac(void) { } -static inline void stac(void) { } - -static inline unsigned long smap_save(void) { return 0; } -static inline void smap_restore(unsigned long flags) { } - -#define ASM_CLAC -#define ASM_STAC - -#endif /* CONFIG_X86_SMAP */ - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_SMAP_H */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 68c257a3de0d..45b18eb94fa1 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -184,14 +184,15 @@ static inline void wbinvd(void) native_wbinvd(); } -#ifdef CONFIG_X86_64 static inline void load_gs_index(unsigned int selector) { +#ifdef CONFIG_X86_64 native_load_gs_index(selector); -} - +#else + loadsegment(gs, selector); #endif +} #endif /* CONFIG_PARAVIRT_XXL */ diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 7b132d0312eb..a800abb1a992 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -19,7 +19,6 @@ struct saved_context { u16 gs; unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; - bool misc_enable_saved; struct saved_msrs saved_msrs; struct desc_ptr gdt_desc; struct desc_ptr idt; @@ -28,6 +27,7 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + bool misc_enable_saved; } __attribute__((packed)); /* routines for saving/restoring kernel state */ diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 35bb35d28733..54df06687d83 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -14,9 +14,13 @@ * Image of the saved processor state, used by the low level ACPI suspend to * RAM code and by the low level hibernation code. * - * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that - * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c, - * still work as required. + * If you modify it, check how it is used in arch/x86/kernel/acpi/wakeup_64.S + * and make sure that __save/__restore_processor_state(), defined in + * arch/x86/power/cpu.c, still work as required. + * + * Because the structure is packed, make sure to avoid unaligned members. For + * optimisation purposes but also because tools like kmemleak only search for + * pointers that are aligned. */ struct saved_context { struct pt_regs regs; @@ -36,7 +40,6 @@ struct saved_context { unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; - bool misc_enable_saved; struct saved_msrs saved_msrs; unsigned long efer; u16 gdt_pad; /* Unused */ @@ -48,6 +51,7 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + bool misc_enable_saved; } __attribute__((packed)); #define loaddebug(thread,register) \ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f70a5108d464..1b07fba11704 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -271,6 +271,7 @@ struct vmcb_seg { u64 base; } __packed; +/* Save area definition for legacy and SEV-MEM guests */ struct vmcb_save_area { struct vmcb_seg es; struct vmcb_seg cs; @@ -282,12 +283,12 @@ struct vmcb_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[43]; + u8 reserved_1[42]; + u8 vmpl; u8 cpl; u8 reserved_2[4]; u64 efer; - u8 reserved_3[104]; - u64 xss; /* Valid for SEV-ES only */ + u8 reserved_3[112]; u64 cr4; u64 cr3; u64 cr0; @@ -297,7 +298,9 @@ struct vmcb_save_area { u64 rip; u8 reserved_4[88]; u64 rsp; - u8 reserved_5[24]; + u64 s_cet; + u64 ssp; + u64 isst_addr; u64 rax; u64 star; u64 lstar; @@ -308,29 +311,145 @@ struct vmcb_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_6[32]; + u8 reserved_5[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - - /* - * The following part of the save area is valid only for - * SEV-ES guests when referenced through the GHCB or for - * saving to the host save area. - */ - u8 reserved_7[72]; + u8 reserved_6[72]; u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ - u8 reserved_7b[4]; +} __packed; + +/* Save area definition for SEV-ES and SEV-SNP guests */ +struct sev_es_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u64 vmpl0_ssp; + u64 vmpl1_ssp; + u64 vmpl2_ssp; + u64 vmpl3_ssp; + u64 u_cet; + u8 reserved_1[2]; + u8 vmpl; + u8 cpl; + u8 reserved_2[4]; + u64 efer; + u8 reserved_3[104]; + u64 xss; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u64 dr0; + u64 dr1; + u64 dr2; + u64 dr3; + u64 dr0_addr_mask; + u64 dr1_addr_mask; + u64 dr2_addr_mask; + u64 dr3_addr_mask; + u8 reserved_4[24]; + u64 rsp; + u64 s_cet; + u64 ssp; + u64 isst_addr; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_5[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; + u8 reserved_7[80]; u32 pkru; - u8 reserved_7a[20]; - u64 reserved_8; /* rax already available at 0x01f8 */ + u8 reserved_8[20]; + u64 reserved_9; /* rax already available at 0x01f8 */ + u64 rcx; + u64 rdx; + u64 rbx; + u64 reserved_10; /* rsp already available at 0x01d8 */ + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; + u8 reserved_11[16]; + u64 guest_exit_info_1; + u64 guest_exit_info_2; + u64 guest_exit_int_info; + u64 guest_nrip; + u64 sev_features; + u64 vintr_ctrl; + u64 guest_exit_code; + u64 virtual_tom; + u64 tlb_id; + u64 pcpu_id; + u64 event_inj; + u64 xcr0; + u8 reserved_12[16]; + + /* Floating point area */ + u64 x87_dp; + u32 mxcsr; + u16 x87_ftw; + u16 x87_fsw; + u16 x87_fcw; + u16 x87_fop; + u16 x87_ds; + u16 x87_cs; + u64 x87_rip; + u8 fpreg_x87[80]; + u8 fpreg_xmm[256]; + u8 fpreg_ymm[256]; +} __packed; + +struct ghcb_save_area { + u8 reserved_1[203]; + u8 cpl; + u8 reserved_2[116]; + u64 xss; + u8 reserved_3[24]; + u64 dr7; + u8 reserved_4[16]; + u64 rip; + u8 reserved_5[88]; + u64 rsp; + u8 reserved_6[24]; + u64 rax; + u8 reserved_7[264]; u64 rcx; u64 rdx; u64 rbx; - u64 reserved_9; /* rsp already available at 0x01d8 */ + u8 reserved_8[8]; u64 rbp; u64 rsi; u64 rdi; @@ -342,22 +461,24 @@ struct vmcb_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_10[16]; + u8 reserved_9[16]; u64 sw_exit_code; u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; - u8 reserved_11[56]; + u8 reserved_10[56]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; } __packed; +#define GHCB_SHARED_BUF_SIZE 2032 + struct ghcb { - struct vmcb_save_area save; - u8 reserved_save[2048 - sizeof(struct vmcb_save_area)]; + struct ghcb_save_area save; + u8 reserved_save[2048 - sizeof(struct ghcb_save_area)]; - u8 shared_buffer[2032]; + u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; u8 reserved_1[10]; u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ @@ -365,13 +486,17 @@ struct ghcb { } __packed; -#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 +#define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 +#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); } @@ -441,26 +566,26 @@ struct vmcb { /* GHCB Accessor functions */ #define GHCB_BITMAP_IDX(field) \ - (offsetof(struct vmcb_save_area, field) / sizeof(u64)) + (offsetof(struct ghcb_save_area, field) / sizeof(u64)) #define DEFINE_GHCB_ACCESSORS(field) \ - static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ + static __always_inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ { \ return test_bit(GHCB_BITMAP_IDX(field), \ (unsigned long *)&ghcb->save.valid_bitmap); \ } \ \ - static inline u64 ghcb_get_##field(struct ghcb *ghcb) \ + static __always_inline u64 ghcb_get_##field(struct ghcb *ghcb) \ { \ return ghcb->save.field; \ } \ \ - static inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \ + static __always_inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \ { \ return ghcb_##field##_is_valid(ghcb) ? ghcb->save.field : 0; \ } \ \ - static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ + static __always_inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ { \ __set_bit(GHCB_BITMAP_IDX(field), \ (unsigned long *)&ghcb->save.valid_bitmap); \ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h new file mode 100644 index 000000000000..020c81a7c729 --- /dev/null +++ b/arch/x86/include/asm/tdx.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021-2022 Intel Corporation */ +#ifndef _ASM_X86_TDX_H +#define _ASM_X86_TDX_H + +#include <linux/init.h> +#include <linux/bits.h> +#include <asm/ptrace.h> +#include <asm/shared/tdx.h> + +/* + * SW-defined error codes. + * + * Bits 47:40 == 0xFF indicate Reserved status code class that never used by + * TDX module. + */ +#define TDX_ERROR _BITUL(63) +#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) +#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) + +#ifndef __ASSEMBLY__ + +/* + * Used to gather the output registers values of the TDCALL and SEAMCALL + * instructions when requesting services from the TDX module. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_module_output { + u64 rcx; + u64 rdx; + u64 r8; + u64 r9; + u64 r10; + u64 r11; +}; + +/* + * Used by the #VE exception handler to gather the #VE exception + * info from the TDX module. This is a software only structure + * and not part of the TDX module/VMM ABI. + */ +struct ve_info { + u64 exit_reason; + u64 exit_qual; + /* Guest Linear (virtual) Address */ + u64 gla; + /* Guest Physical Address */ + u64 gpa; + u32 instr_len; + u32 instr_info; +}; + +#ifdef CONFIG_INTEL_TDX_GUEST + +void __init tdx_early_init(void); + +/* Used to communicate with the TDX module */ +u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + struct tdx_module_output *out); + +void tdx_get_ve_info(struct ve_info *ve); + +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); + +void tdx_safe_halt(void); + +bool tdx_early_handle_ve(struct pt_regs *regs); + +#else + +static inline void tdx_early_init(void) { }; +static inline void tdx_safe_halt(void) { }; + +static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; } + +#endif /* CONFIG_INTEL_TDX_GUEST */ + +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_INTEL_TDX_GUEST) +long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, + unsigned long p3, unsigned long p4); +#else +static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3, + unsigned long p4) +{ + return -ENODEV; +} +#endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */ +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index d20ab0921480..1cc15528ce29 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ebec69c35e95..f0cb881c1d69 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -92,7 +92,6 @@ struct thread_info { #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ -#define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -116,7 +115,6 @@ struct thread_info { #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) @@ -128,7 +126,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \ - _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD) + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) /* * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h index a4a8b1b16c0c..956e4145311b 100644 --- a/arch/x86/include/asm/timex.h +++ b/arch/x86/include/asm/timex.h @@ -5,6 +5,15 @@ #include <asm/processor.h> #include <asm/tsc.h> +static inline unsigned long random_get_entropy(void) +{ + if (!IS_ENABLED(CONFIG_X86_TSC) && + !cpu_feature_enabled(X86_FEATURE_TSC)) + return random_get_entropy_fallback(); + return rdtsc(); +} +#define random_get_entropy random_get_entropy + /* Assume we use the PIT time source for the clock tick */ #define CLOCK_TICK_RATE PIT_TICK_RATE diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 9619385bf749..458c891a8273 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -212,30 +212,19 @@ static inline long arch_scale_freq_capacity(int cpu) } #define arch_scale_freq_capacity arch_scale_freq_capacity -extern void arch_scale_freq_tick(void); -#define arch_scale_freq_tick arch_scale_freq_tick - extern void arch_set_max_freq_ratio(bool turbo_disabled); -void init_freq_invariance(bool secondary, bool cppc_ready); +extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); #else -static inline void arch_set_max_freq_ratio(bool turbo_disabled) -{ -} -static inline void init_freq_invariance(bool secondary, bool cppc_ready) -{ -} +static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } +static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } #endif +extern void arch_scale_freq_tick(void); +#define arch_scale_freq_tick arch_scale_freq_tick + #ifdef CONFIG_ACPI_CPPC_LIB void init_freq_invariance_cppc(void); #define arch_init_invariance_cppc init_freq_invariance_cppc - -bool amd_set_max_freq_ratio(u64 *ratio); -#else -static inline bool amd_set_max_freq_ratio(u64 *ratio) -{ - return false; -} #endif #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 35317c5c551d..47ecfff2c83d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,7 +13,7 @@ #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); +struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs); void __init trap_init(void); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 01a300a9700b..fbdc3d951494 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -20,13 +20,12 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { -#ifndef CONFIG_X86_TSC - if (!boot_cpu_has(X86_FEATURE_TSC)) + if (!IS_ENABLED(CONFIG_X86_TSC) && + !cpu_feature_enabled(X86_FEATURE_TSC)) return 0; -#endif - return rdtsc(); } +#define get_cycles get_cycles extern struct system_counterval_t convert_art_to_tsc(u64 art); extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h index 7ee7ba0d63a3..769b939444ae 100644 --- a/arch/x86/include/uapi/asm/amd_hsmp.h +++ b/arch/x86/include/uapi/asm/amd_hsmp.h @@ -31,9 +31,22 @@ enum hsmp_message_ids { HSMP_GET_CCLK_THROTTLE_LIMIT, /* 10h Get CCLK frequency limit in socket */ HSMP_GET_C0_PERCENT, /* 11h Get average C0 residency in socket */ HSMP_SET_NBIO_DPM_LEVEL, /* 12h Set max/min LCLK DPM Level for a given NBIO */ - /* 13h Reserved */ - HSMP_GET_DDR_BANDWIDTH = 0x14, /* 14h Get theoretical maximum and current DDR Bandwidth */ - HSMP_GET_TEMP_MONITOR, /* 15h Get per-DIMM temperature and refresh rates */ + HSMP_GET_NBIO_DPM_LEVEL, /* 13h Get LCLK DPM level min and max for a given NBIO */ + HSMP_GET_DDR_BANDWIDTH, /* 14h Get theoretical maximum and current DDR Bandwidth */ + HSMP_GET_TEMP_MONITOR, /* 15h Get socket temperature */ + HSMP_GET_DIMM_TEMP_RANGE, /* 16h Get per-DIMM temperature range and refresh rate */ + HSMP_GET_DIMM_POWER, /* 17h Get per-DIMM power consumption */ + HSMP_GET_DIMM_THERMAL, /* 18h Get per-DIMM thermal sensors */ + HSMP_GET_SOCKET_FREQ_LIMIT, /* 19h Get current active frequency per socket */ + HSMP_GET_CCLK_CORE_LIMIT, /* 1Ah Get CCLK frequency limit per core */ + HSMP_GET_RAILS_SVI, /* 1Bh Get SVI-based Telemetry for all rails */ + HSMP_GET_SOCKET_FMAX_FMIN, /* 1Ch Get Fmax and Fmin per socket */ + HSMP_GET_IOLINK_BANDWITH, /* 1Dh Get current bandwidth on IO Link */ + HSMP_GET_XGMI_BANDWITH, /* 1Eh Get current bandwidth on xGMI Link */ + HSMP_SET_GMI3_WIDTH, /* 1Fh Set max and min GMI3 Link width */ + HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */ + HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */ + HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */ HSMP_MSG_ID_MAX, }; @@ -175,8 +188,12 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { */ {1, 0, HSMP_SET}, - /* RESERVED message */ - {0, 0, HSMP_RSVD}, + /* + * HSMP_GET_NBIO_DPM_LEVEL, num_args = 1, response_sz = 1 + * input: args[0] = nbioid[23:16] + * output: args[0] = max dpm level[15:8] + min dpm level[7:0] + */ + {1, 1, HSMP_GET}, /* * HSMP_GET_DDR_BANDWIDTH, num_args = 0, response_sz = 1 @@ -191,6 +208,93 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { * [7:5] fractional part */ {0, 1, HSMP_GET}, + + /* + * HSMP_GET_DIMM_TEMP_RANGE, num_args = 1, response_sz = 1 + * input: args[0] = DIMM address[7:0] + * output: args[0] = refresh rate[3] + temperature range[2:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_DIMM_POWER, num_args = 1, response_sz = 1 + * input: args[0] = DIMM address[7:0] + * output: args[0] = DIMM power in mW[31:17] + update rate in ms[16:8] + + * DIMM address[7:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1 + * input: args[0] = DIMM address[7:0] + * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] + + * DIMM address[7:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_SOCKET_FREQ_LIMIT, num_args = 0, response_sz = 1 + * output: args[0] = frequency in MHz[31:16] + frequency source[15:0] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_CCLK_CORE_LIMIT, num_args = 1, response_sz = 1 + * input: args[0] = apic id [31:0] + * output: args[0] = frequency in MHz[31:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_RAILS_SVI, num_args = 0, response_sz = 1 + * output: args[0] = power in mW[31:0] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_SOCKET_FMAX_FMIN, num_args = 0, response_sz = 1 + * output: args[0] = fmax in MHz[31:16] + fmin in MHz[15:0] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_IOLINK_BANDWITH, num_args = 1, response_sz = 1 + * input: args[0] = link id[15:8] + bw type[2:0] + * output: args[0] = io bandwidth in Mbps[31:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_XGMI_BANDWITH, num_args = 1, response_sz = 1 + * input: args[0] = link id[15:8] + bw type[2:0] + * output: args[0] = xgmi bandwidth in Mbps[31:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_SET_GMI3_WIDTH, num_args = 1, response_sz = 0 + * input: args[0] = min link width[15:8] + max link width[7:0] + */ + {1, 0, HSMP_SET}, + + /* + * HSMP_SET_PCI_RATE, num_args = 1, response_sz = 1 + * input: args[0] = link rate control value + * output: args[0] = previous link rate control value + */ + {1, 1, HSMP_SET}, + + /* + * HSMP_SET_POWER_MODE, num_args = 1, response_sz = 0 + * input: args[0] = power efficiency mode[2:0] + */ + {1, 0, HSMP_SET}, + + /* + * HSMP_SET_PSTATE_MAX_MIN, num_args = 1, response_sz = 0 + * input: args[0] = min df pstate[15:8] + max df pstate[7:0] + */ + {1, 0, HSMP_SET}, }; /* Reset to default packing */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index b25d3f82c2f3..bea5cdcdf532 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -10,6 +10,7 @@ #define SETUP_EFI 4 #define SETUP_APPLE_PROPERTIES 5 #define SETUP_JAILHOUSE 6 +#define SETUP_CC_BLOB 7 #define SETUP_INDIRECT (1<<31) @@ -187,7 +188,8 @@ struct boot_params { __u32 ext_ramdisk_image; /* 0x0c0 */ __u32 ext_ramdisk_size; /* 0x0c4 */ __u32 ext_cmd_line_ptr; /* 0x0c8 */ - __u8 _pad4[116]; /* 0x0cc */ + __u8 _pad4[112]; /* 0x0cc */ + __u32 cc_blob_address; /* 0x13c */ struct edid_info edid_info; /* 0x140 */ struct efi_info efi_info; /* 0x1c0 */ __u32 alt_mem_k; /* 0x1e0 */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index efa969325ede..f69c168391aa 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -108,6 +108,14 @@ #define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 +#define SVM_VMGEXIT_PSC 0x80000010 +#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011 +#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012 +#define SVM_VMGEXIT_AP_CREATION 0x80000013 +#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 +#define SVM_VMGEXIT_AP_CREATE 1 +#define SVM_VMGEXIT_AP_DESTROY 2 +#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff /* Exit code reserved for hypervisor/software use */ @@ -218,6 +226,11 @@ { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ + { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \ + { SVM_VMGEXIT_GUEST_REQUEST, "vmgexit_guest_request" }, \ + { SVM_VMGEXIT_EXT_GUEST_REQUEST, "vmgexit_ext_guest_request" }, \ + { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \ + { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c41ef42adbe8..1a2dc328cb5e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -46,8 +46,6 @@ endif # non-deterministic coverage. KCOV_INSTRUMENT := n -CFLAGS_head$(BITS).o += -fno-stack-protector - CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0d01e7f5078c..907cc98b1938 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -65,6 +65,13 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; static bool acpi_support_online_capable; #endif +#ifdef CONFIG_X86_64 +/* Physical address of the Multiprocessor Wakeup Structure mailbox */ +static u64 acpi_mp_wake_mailbox_paddr; +/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ +static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; +#endif + #ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug @@ -336,7 +343,60 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e return 0; } -#endif /*CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_X86_64 +static int acpi_wakeup_cpu(int apicid, unsigned long start_ip) +{ + /* + * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). + * + * Wakeup of secondary CPUs is fully serialized in the core code. + * No need to protect acpi_mp_wake_mailbox from concurrent accesses. + */ + if (!acpi_mp_wake_mailbox) { + acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, + sizeof(*acpi_mp_wake_mailbox), + MEMREMAP_WB); + } + + /* + * Mailbox memory is shared between the firmware and OS. Firmware will + * listen on mailbox command address, and once it receives the wakeup + * command, the CPU associated with the given apicid will be booted. + * + * The value of 'apic_id' and 'wakeup_vector' must be visible to the + * firmware before the wakeup command is visible. smp_store_release() + * ensures ordering and visibility. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + acpi_mp_wake_mailbox->wakeup_vector = start_ip; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_WAKEUP); + + /* + * Wait for the CPU to wake up. + * + * The CPU being woken up is essentially in a spin loop waiting to be + * woken up. It should not take long for it wake up and acknowledge by + * zeroing out ->command. + * + * ACPI specification doesn't provide any guidance on how long kernel + * has to wait for a wake up acknowledgement. It also doesn't provide + * a way to cancel a wake up request if it takes too long. + * + * In TDX environment, the VMM has control over how long it takes to + * wake up secondary. It can postpone scheduling secondary vCPU + * indefinitely. Giving up on wake up request and reporting error opens + * possible attack vector for VMM: it can wake up a secondary CPU when + * kernel doesn't expect it. Wait until positive result of the wake up + * request. + */ + while (READ_ONCE(acpi_mp_wake_mailbox->command)) + cpu_relax(); + + return 0; +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 @@ -375,7 +435,7 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, isa_irq_to_gsi[bus_irq] = gsi; } -static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, +static void mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { #ifdef CONFIG_X86_MPPARSE @@ -387,9 +447,9 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, u8 pin; if (!acpi_ioapic) - return 0; + return; if (!dev || !dev_is_pci(dev)) - return 0; + return; pdev = to_pci_dev(dev); number = pdev->bus->number; @@ -408,7 +468,6 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, mp_save_irq(&mp_irq); #endif - return 0; } static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, @@ -1083,6 +1142,29 @@ static int __init acpi_parse_madt_lapic_entries(void) } return 0; } + +#ifdef CONFIG_X86_64 +static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_multiproc_wakeup *mp_wake; + + if (!IS_ENABLED(CONFIG_SMP)) + return -ENODEV; + + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + if (BAD_MADT_ENTRY(mp_wake, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_mp_wake_mailbox_paddr = mp_wake->base_address; + + acpi_wake_cpu_handler_update(acpi_wakeup_cpu); + + return 0; +} +#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -1278,6 +1360,14 @@ static void __init acpi_process_madt(void) smp_found_config = 1; } + +#ifdef CONFIG_X86_64 + /* + * Parse MADT MP Wake entry. + */ + acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP, + acpi_parse_mp_wake, 1); +#endif } if (error == -EINVAL) { /* @@ -1772,7 +1862,7 @@ int __acpi_release_global_lock(unsigned int *lock) void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) { - e820__range_add(addr, size, E820_TYPE_ACPI); + e820__range_add(addr, size, E820_TYPE_NVS); e820__update_table_print(); } diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index df1644d9b3b6..8b8cbf22461a 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -50,20 +50,17 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) return err; } -bool amd_set_max_freq_ratio(u64 *ratio) +static void amd_set_max_freq_ratio(void) { struct cppc_perf_caps perf_caps; u64 highest_perf, nominal_perf; u64 perf_ratio; int rc; - if (!ratio) - return false; - rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { pr_debug("Could not retrieve perf counters (%d)\n", rc); - return false; + return; } highest_perf = amd_get_highest_perf(); @@ -71,7 +68,7 @@ bool amd_set_max_freq_ratio(u64 *ratio) if (!highest_perf || !nominal_perf) { pr_debug("Could not retrieve highest or nominal performance\n"); - return false; + return; } perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); @@ -79,25 +76,27 @@ bool amd_set_max_freq_ratio(u64 *ratio) perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; if (!perf_ratio) { pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return false; + return; } - *ratio = perf_ratio; - arch_set_max_freq_ratio(false); - - return true; + freq_invariance_set_perf_ratio(perf_ratio, false); } static DEFINE_MUTEX(freq_invariance_lock); void init_freq_invariance_cppc(void) { - static bool secondary; + static bool init_done; - mutex_lock(&freq_invariance_lock); + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return; - init_freq_invariance(secondary, true); - secondary = true; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + mutex_lock(&freq_invariance_lock); + if (!init_done) + amd_set_max_freq_ratio(); + init_done = true; mutex_unlock(&freq_invariance_lock); } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d374cb3cf024..e257f6c80372 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -338,7 +338,7 @@ next: } } -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg @@ -507,11 +507,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } } -#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ +#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } -#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ +#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT @@ -994,7 +994,21 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; -static void *__text_poke(void *addr, const void *opcode, size_t len) +static void text_poke_memcpy(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +static void text_poke_memset(void *dst, const void *src, size_t len) +{ + int c = *(const int *)src; + + memset(dst, c, len); +} + +typedef void text_poke_f(void *dst, const void *src, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; @@ -1059,7 +1073,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) prev = use_temporary_mm(poking_mm); kasan_disable_current(); - memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + func((u8 *)poking_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -1087,11 +1101,13 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); - /* - * If the text does not match what we just wrote then something is - * fundamentally screwy; there's nothing we can really do about that. - */ - BUG_ON(memcmp(addr, opcode, len)); + if (func == text_poke_memcpy) { + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, src, len)); + } local_irq_restore(flags); pte_unmap_unlock(ptep, ptl); @@ -1118,7 +1134,7 @@ void *text_poke(void *addr, const void *opcode, size_t len) { lockdep_assert_held(&text_mutex); - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); } /** @@ -1137,7 +1153,7 @@ void *text_poke(void *addr, const void *opcode, size_t len) */ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); } /** @@ -1167,7 +1183,38 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); - __text_poke((void *)ptr, opcode + patched, s); + __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; +} + +/** + * text_poke_set - memset into (an unused part of) RX memory + * @addr: address to modify + * @c: the byte to fill the area with + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * This is useful to overwrite unused regions of RX memory with illegal + * instructions. + */ +void *text_poke_set(void *addr, int c, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); patched += s; } mutex_unlock(&text_mutex); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 020c906f7934..190e0f763375 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -188,7 +188,7 @@ int amd_smn_write(u16 node, u32 address, u32 value) EXPORT_SYMBOL_GPL(amd_smn_write); -int amd_cache_northbridges(void) +static int amd_cache_northbridges(void) { const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *link_ids = amd_nb_link_ids; @@ -210,14 +210,14 @@ int amd_cache_northbridges(void) } misc = NULL; - while ((misc = next_northbridge(misc, misc_ids)) != NULL) + while ((misc = next_northbridge(misc, misc_ids))) misc_count++; if (!misc_count) return -ENODEV; root = NULL; - while ((root = next_northbridge(root, root_ids)) != NULL) + while ((root = next_northbridge(root, root_ids))) root_count++; if (root_count) { @@ -290,7 +290,6 @@ int amd_cache_northbridges(void) return 0; } -EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* * Ignores subdevice/subvendor but as far as I can figure out diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b70344bf6600..189d3a5e471a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -170,7 +170,7 @@ static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return 0; + return 1; } __setup("apicpmtimer", setup_apicpmtimer); #endif @@ -320,6 +320,9 @@ int lapic_get_maxlvt(void) #define APIC_DIVISOR 16 #define TSC_DIVISOR 8 +/* i82489DX specific */ +#define I82489DX_BASE_DIVIDER (((0x2) << 18)) + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call @@ -340,8 +343,14 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + /* + * The i82489DX APIC uses bit 18 and 19 for the base divider. This + * overlaps with bit 18 on integrated APICs, but is not documented + * in the SDM. No problem though. i82489DX equipped systems do not + * have TSC deadline timer. + */ if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + lvtt_value |= I82489DX_BASE_DIVIDER; if (!irqen) lvtt_value |= APIC_LVT_MASKED; @@ -1419,22 +1428,21 @@ void __init apic_intr_mode_init(void) return; case APIC_VIRTUAL_WIRE: pr_info("APIC: Switch to virtual wire mode setup\n"); - default_setup_apic_routing(); break; case APIC_VIRTUAL_WIRE_NO_CONFIG: pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); upmode = true; - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO: pr_info("APIC: Switch to symmetric I/O mode setup\n"); - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO_NO_ROUTING: pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); break; } + default_setup_apic_routing(); + if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -2551,6 +2559,16 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) } EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); +#ifdef CONFIG_X86_64 +void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) + (*drv)->wakeup_secondary_cpu_64 = handler; +} +#endif + /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c1bb384935b0..a868b76cd3d4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -65,6 +65,7 @@ #include <asm/irq_remapping.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/pgtable.h> #define for_each_ioapic(idx) \ for ((idx) = 0; (idx) < nr_ioapics; (idx)++) @@ -2677,6 +2678,19 @@ static struct resource * __init ioapic_setup_resources(void) return res; } +static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys) +{ + pgprot_t flags = FIXMAP_PAGE_NOCACHE; + + /* + * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot + * bits, just like normal ioremap(): + */ + flags = pgprot_decrypted(flags); + + __set_fixmap(idx, phys, flags); +} + void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; @@ -2709,7 +2723,7 @@ fake_ioapic_page: __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } - set_fixmap_nocache(idx, ioapic_phys); + io_apic_set_fixmap(idx, ioapic_phys); apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys); @@ -2838,7 +2852,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, ioapics[idx].mp_config.flags = MPC_APIC_USABLE; ioapics[idx].mp_config.apicaddr = address; - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address); if (bad_ioapic_register(idx)) { clear_fixmap(FIX_IO_APIC_BASE_0 + idx); return -ENODEV; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5a48e66e4f5..482855227964 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -199,7 +199,13 @@ static void __init uv_tsc_check_sync(void) int mmr_shift; char *state; - /* Different returns from different UV BIOS versions */ + /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */ + if (!is_uv(UV2|UV3|UV4)) { + mark_tsc_async_resets("UV5+"); + return; + } + + /* UV2,3,4, UV BIOS TSC sync state available */ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); mmr_shift = is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; @@ -1340,7 +1346,7 @@ static void __init decode_gam_params(unsigned long ptr) static void __init decode_gam_rng_tbl(unsigned long ptr) { struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; - unsigned long lgre = 0; + unsigned long lgre = 0, gend = 0; int index = 0; int sock_min = 999999, pnode_min = 99999; int sock_max = -1, pnode_max = -1; @@ -1374,6 +1380,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) flag, size, suffix[order], gre->type, gre->nasid, gre->sockid, gre->pnode); + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT; + /* update to next range start */ lgre = gre->limit; if (sock_min > gre->sockid) @@ -1391,7 +1400,8 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) _max_pnode = pnode_max; _gr_table_len = index; - pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n", + index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend)); } /* Walk through UVsystab decoding the fields */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9fb0a2f8b62a..437308004ef2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -18,6 +18,7 @@ #include <asm/bootparam.h> #include <asm/suspend.h> #include <asm/tlbflush.h> +#include <asm/tdx.h> #ifdef CONFIG_XEN #include <xen/interface/xen.h> @@ -66,6 +67,22 @@ static void __used common(void) #endif BLANK(); + OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx); + OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx); + OFFSET(TDX_MODULE_r8, tdx_module_output, r8); + OFFSET(TDX_MODULE_r9, tdx_module_output, r9); + OFFSET(TDX_MODULE_r10, tdx_module_output, r10); + OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + + BLANK(); + OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); + OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); + OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); + OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); + OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); + OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); + + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); OFFSET(BP_loadflags, boot_params, hdr.loadflags); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 9ca008f9e9b1..1f60a2b27936 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -6,146 +6,446 @@ * Copyright (C) 2017 Intel Corp. * Author: Len Brown <len.brown@intel.com> */ - +#include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> -#include <linux/cpufreq.h> -#include <linux/smp.h> -#include <linux/sched/isolation.h> #include <linux/rcupdate.h> +#include <linux/sched/isolation.h> +#include <linux/sched/topology.h> +#include <linux/smp.h> +#include <linux/syscore_ops.h> + +#include <asm/cpu.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> #include "cpu.h" -struct aperfmperf_sample { - unsigned int khz; - atomic_t scfpending; - ktime_t time; - u64 aperf; - u64 mperf; +struct aperfmperf { + seqcount_t seq; + unsigned long last_update; + u64 acnt; + u64 mcnt; + u64 aperf; + u64 mperf; }; -static DEFINE_PER_CPU(struct aperfmperf_sample, samples); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { + .seq = SEQCNT_ZERO(cpu_samples.seq) +}; -#define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 10 -#define APERFMPERF_STALE_THRESHOLD_MS 1000 +static void init_counter_refs(void) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + this_cpu_write(cpu_samples.aperf, aperf); + this_cpu_write(cpu_samples.mperf, mperf); +} + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* - * aperfmperf_snapshot_khz() - * On the current CPU, snapshot APERF, MPERF, and jiffies - * unless we already did it within 10ms - * calculate kHz, save snapshot + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ -static void aperfmperf_snapshot_khz(void *dummy) + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) { - u64 aperf, aperf_delta; - u64 mperf, mperf_delta; - struct aperfmperf_sample *s = this_cpu_ptr(&samples); - unsigned long flags; + arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); - local_irq_save(flags); - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - local_irq_restore(flags); +static bool __init turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { + X86_MATCH(SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), + {} +}; + +static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} - aperf_delta = aperf - s->aperf; - mperf_delta = mperf - s->mperf; +static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + u64 msr; + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ + + /* The CPU may have less than 4 cores */ + if (!*turbo_freq) + *turbo_freq = msr & 0xFF; /* 1C turbo */ + + return true; +} + +static bool __init intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + u64 turbo_ratio; + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_knl_turbo_ratio_limits) && + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: /* - * There is no architectural guarantee that MPERF - * increments faster than we can read it. + * Some hypervisors advertise X86_FEATURE_APERFMPERF + * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. */ - if (mperf_delta == 0) - return; + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); + return false; + } - s->time = ktime_get(); - s->aperf = aperf; - s->mperf = mperf; - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); - atomic_set_release(&s->scfpending, 0); + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; + arch_set_max_freq_ratio(turbo_disabled()); + + return true; } -static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) +#ifdef CONFIG_PM_SLEEP +static struct syscore_ops freq_invariance_syscore_ops = { + .resume = init_counter_refs, +}; + +static void register_freq_invariance_syscore_ops(void) { - s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); - struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); + register_syscore_ops(&freq_invariance_syscore_ops); +} +#else +static inline void register_freq_invariance_syscore_ops(void) {} +#endif - /* Don't bother re-computing within the cache threshold time. */ - if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return true; +static void freq_invariance_enable(void) +{ + if (static_branch_unlikely(&arch_scale_freq_key)) { + WARN_ON_ONCE(1); + return; + } + static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); +} + +void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) +{ + arch_turbo_freq_ratio = ratio; + arch_set_max_freq_ratio(turbo_disabled); + freq_invariance_enable(); +} + +static void __init bp_init_freq_invariance(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; - if (!atomic_xchg(&s->scfpending, 1) || wait) - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + if (intel_set_max_freq_ratio()) + freq_invariance_enable(); +} - /* Return false if the previous iteration was too long ago. */ - return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; +static void disable_freq_invariance_workfn(struct work_struct *work) +{ + static_branch_disable(&arch_scale_freq_key); } -unsigned int aperfmperf_get_khz(int cpu) +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +static void scale_freq_tick(u64 acnt, u64 mcnt) { - if (!cpu_khz) - return 0; + u64 freq_scale; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; + if (!arch_scale_freq_invariant()) + return; - if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) - return 0; + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; - if (rcu_is_idle_cpu(cpu)) - return 0; /* Idle CPUs are completely uninteresting. */ + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; - aperfmperf_snapshot_cpu(cpu, ktime_get(), true); - return per_cpu(samples.khz, cpu); + freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); + return; + +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); } +#else +static inline void bp_init_freq_invariance(void) { } +static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } +#endif /* CONFIG_X86_64 && CONFIG_SMP */ -void arch_freq_prepare_all(void) +void arch_scale_freq_tick(void) { - ktime_t now = ktime_get(); - bool wait = false; - int cpu; + struct aperfmperf *s = this_cpu_ptr(&cpu_samples); + u64 acnt, mcnt, aperf, mperf; - if (!cpu_khz) + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return; + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + acnt = aperf - s->aperf; + mcnt = mperf - s->mperf; - for_each_online_cpu(cpu) { - if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) - continue; - if (rcu_is_idle_cpu(cpu)) - continue; /* Idle CPUs are completely uninteresting. */ - if (!aperfmperf_snapshot_cpu(cpu, now, false)) - wait = true; - } + s->aperf = aperf; + s->mperf = mperf; + + raw_write_seqcount_begin(&s->seq); + s->last_update = jiffies; + s->acnt = acnt; + s->mcnt = mcnt; + raw_write_seqcount_end(&s->seq); - if (wait) - msleep(APERFMPERF_REFRESH_DELAY_MS); + scale_freq_tick(acnt, mcnt); } +/* + * Discard samples older than the define maximum sample age of 20ms. There + * is no point in sending IPIs in such a case. If the scheduler tick was + * not running then the CPU is either idle or isolated. + */ +#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) + unsigned int arch_freq_get_on_cpu(int cpu) { - struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); + struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); + unsigned int seq, freq; + unsigned long last; + u64 acnt, mcnt; - if (!cpu_khz) - return 0; + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + goto fallback; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; + do { + seq = raw_read_seqcount_begin(&s->seq); + last = s->last_update; + acnt = s->acnt; + mcnt = s->mcnt; + } while (read_seqcount_retry(&s->seq, seq)); - if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) - return 0; + /* + * Bail on invalid count and when the last update was too long ago, + * which covers idle and NOHZ full CPUs. + */ + if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) + goto fallback; + + return div64_u64((cpu_khz * acnt), mcnt); + +fallback: + freq = cpufreq_quick_get(cpu); + return freq ? freq : cpu_khz; +} - if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) - return per_cpu(samples.khz, cpu); +static int __init bp_init_aperfmperf(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return 0; - msleep(APERFMPERF_REFRESH_DELAY_MS); - atomic_set(&s->scfpending, 1); - smp_mb(); /* ->scfpending before smp_call_function_single(). */ - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + init_counter_refs(); + bp_init_freq_invariance(); + return 0; +} +early_initcall(bp_init_aperfmperf); - return per_cpu(samples.khz, cpu); +void ap_init_aperfmperf(void) +{ + if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + init_counter_refs(); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6296e1ebed1d..d879a6c93609 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -446,6 +446,13 @@ void update_srbds_msr(void) if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) return; + /* + * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX + * being disabled and it hasn't received the SRBDS MSR microcode. + */ + if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + return; + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e342ae4db3c4..2e9142797c99 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -60,6 +60,7 @@ #include <asm/uv/uv.h> #include <asm/sigframe.h> #include <asm/traps.h> +#include <asm/sev.h> #include "cpu.h" @@ -298,13 +299,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - /* Standard macro to see if a specific flag is changeable */ static inline int flag_is_changeable_p(u32 flag) { @@ -376,26 +370,12 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static __init int setup_disable_smep(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMEP); - return 1; -} -__setup("nosmep", setup_disable_smep); - static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) cr4_set_bits(X86_CR4_SMEP); } -static __init int setup_disable_smap(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMAP); - return 1; -} -__setup("nosmap", setup_disable_smap); - static __always_inline void setup_smap(struct cpuinfo_x86 *c) { unsigned long eflags = native_save_fl(); @@ -403,14 +383,8 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) /* This should have been cleared long ago */ BUG_ON(eflags & X86_EFLAGS_AC); - if (cpu_has(c, X86_FEATURE_SMAP)) { -#ifdef CONFIG_X86_SMAP + if (cpu_has(c, X86_FEATURE_SMAP)) cr4_set_bits(X86_CR4_SMAP); -#else - clear_cpu_cap(c, X86_FEATURE_SMAP); - cr4_clear_bits(X86_CR4_SMAP); -#endif - } } static __always_inline void setup_umip(struct cpuinfo_x86 *c) @@ -1368,8 +1342,8 @@ static void detect_nopl(void) static void __init cpu_parse_early_param(void) { char arg[128]; - char *argptr = arg; - int arglen, res, bit; + char *argptr = arg, *opt; + int arglen, taint = 0; #ifdef CONFIG_X86_32 if (cmdline_find_option_bool(boot_command_line, "no387")) @@ -1397,21 +1371,61 @@ static void __init cpu_parse_early_param(void) return; pr_info("Clearing CPUID bits:"); - do { - res = get_option(&argptr, &bit); - if (res == 0 || res == 3) - break; - /* If the argument was too long, the last bit may be cut off */ - if (res == 1 && arglen >= sizeof(arg)) - break; + while (argptr) { + bool found __maybe_unused = false; + unsigned int bit; + + opt = strsep(&argptr, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + +#ifdef CONFIG_X86_FEATURE_NAMES + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + else +#endif + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + + setup_clear_cpu_cap(bit); + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. + */ + continue; + } + +#ifdef CONFIG_X86_FEATURE_NAMES + for (bit = 0; bit < 32 * NCAPINTS; bit++) { + if (!x86_cap_flag(bit)) + continue; - if (bit >= 0 && bit < NCAPINTS * 32) { - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + if (strcmp(x86_cap_flag(bit), opt)) + continue; + + pr_cont(" %s", opt); setup_clear_cpu_cap(bit); + taint++; + found = true; + break; } - } while (res == 2); + + if (!found) + pr_cont(" (unknown: %s)", opt); +#endif + } pr_cont("\n"); + + if (taint) + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); } /* @@ -1859,14 +1873,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) tsx_ap_init(); } -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); - setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); - return 1; -} -__setup("noclflush", setup_noclflush); - void print_cpu_info(struct cpuinfo_x86 *c) { const char *vendor = NULL; @@ -2126,6 +2132,9 @@ void cpu_init_exception_handling(void) load_TR_desc(); + /* GHCB needs to be setup to handle #VC. */ + setup_ghcb(); + /* Finally load the IDT */ load_current_idt(); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f7a5370a9b3b..fd5dead8371c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,10 +7,13 @@ #include <linux/smp.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/semaphore.h> #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> +#include <linux/workqueue.h> #include <linux/delay.h> +#include <linux/cpuhotplug.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -91,7 +94,7 @@ static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) { ring3mwait_disabled = true; - return 0; + return 1; } __setup("ring3mwait=disable", ring3mwait_disable); @@ -181,6 +184,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -999,6 +1034,8 @@ static const struct { static struct ratelimit_state bld_ratelimit; +static DEFINE_SEMAPHORE(buslock_sem); + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt), ratelimit; @@ -1109,18 +1146,52 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); + static void split_lock_warn(unsigned long ip) { - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); + int cpu; - /* - * Disable the split lock detection for this task so it can make - * progress and set TIF_SLD so the detection is re-enabled via - * switch_to_sld() when the task is scheduled out. - */ + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + /* misery factor #1, sleep 10ms before trying to execute split lock */ + if (msleep_interruptible(10) > 0) + return; + /* Misery factor #2, only allow one buslocked disabled core at a time */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + cpu = get_cpu(); + schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + + /* Disable split lock detection on this CPU to make progress */ sld_update_msr(false); - set_tsk_thread_flag(current, TIF_SLD); + put_cpu(); } bool handle_guest_split_lock(unsigned long ip) @@ -1194,18 +1265,6 @@ void handle_bus_lock(struct pt_regs *regs) } /* - * This function is called only when switching between tasks with - * different split-lock detection modes. It sets the MSR for the - * mode of the new task. This is right most of the time, but since - * the MSR is shared by hyperthreads on a physical core there can - * be glitches when the two threads need different modes. - */ -void switch_to_sld(unsigned long tifn) -{ - sld_update_msr(!(tifn & _TIF_SLD)); -} - -/* * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should * only be trusted if it is confirmed that a CPU model implements a * specific feature at a particular bit position. @@ -1230,6 +1289,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1), {} }; @@ -1274,10 +1334,14 @@ static void sld_state_show(void) pr_info("disabled\n"); break; case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { pr_info("#DB: warning on user-space bus_locks\n"); + } break; case sld_fatal: if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 1940d305db1c..1c87501e0fa3 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1294,10 +1294,23 @@ out_free: kfree(bank); } +static void __threshold_remove_device(struct threshold_bank **bp) +{ + unsigned int bank, numbanks = this_cpu_read(mce_num_banks); + + for (bank = 0; bank < numbanks; bank++) { + if (!bp[bank]) + continue; + + threshold_remove_bank(bp[bank]); + bp[bank] = NULL; + } + kfree(bp); +} + int mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); - unsigned int bank, numbanks = this_cpu_read(mce_num_banks); if (!bp) return 0; @@ -1308,13 +1321,7 @@ int mce_threshold_remove_device(unsigned int cpu) */ this_cpu_write(threshold_banks, NULL); - for (bank = 0; bank < numbanks; bank++) { - if (bp[bank]) { - threshold_remove_bank(bp[bank]); - bp[bank] = NULL; - } - } - kfree(bp); + __threshold_remove_device(bp); return 0; } @@ -1351,15 +1358,14 @@ int mce_threshold_create_device(unsigned int cpu) if (!(this_cpu_read(bank_map) & (1 << bank))) continue; err = threshold_create_bank(bp, cpu, bank); - if (err) - goto out_err; + if (err) { + __threshold_remove_device(bp); + return err; + } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; return 0; -out_err: - mce_threshold_remove_device(cpu); - return err; } diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 0e3ae64d3b76..717192915f28 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -177,16 +177,14 @@ retry: /* no more record */ if (*record_id == APEI_ERST_INVALID_RECORD_ID) goto out; - rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd), + &CPER_CREATOR_MCE); /* someone else has cleared the record, try next one */ if (rc == -ENOENT) goto retry; else if (rc < 0) goto out; - /* try to skip other type records in storage */ - else if (rc != sizeof(rcd) || - !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE)) - goto retry; + memcpy(m, &rcd.mce, sizeof(*m)); rc = sizeof(*m); out: diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 981496e6bc0e..d775fcd74e98 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -69,7 +69,9 @@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); struct mce_bank { u64 ctl; /* subevents to enable */ - bool init; /* initialise bank? */ + + __u64 init : 1, /* initialise bank? */ + __reserved_1 : 63; }; static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 1add86935349..00483d1c27e4 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -301,85 +301,65 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs) } } -static __always_inline int mce_severity_amd_smca(struct mce *m, enum context err_ctx) +/* See AMD PPR(s) section Machine Check Error Handling. */ +static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { - u64 mcx_cfg; + char *panic_msg = NULL; + int ret; /* - * We need to look at the following bits: - * - "succor" bit (data poisoning support), and - * - TCC bit (Task Context Corrupt) - * in MCi_STATUS to determine error severity. + * Default return value: Action required, the error must be handled + * immediately. */ - if (!mce_flags.succor) - return MCE_PANIC_SEVERITY; - - mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank)); - - /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ - if ((mcx_cfg & MCI_CONFIG_MCAX) && - (m->status & MCI_STATUS_TCC) && - (err_ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - - /* ...otherwise invoke hwpoison handler. */ - return MCE_AR_SEVERITY; -} - -/* - * See AMD Error Scope Hierarchy table in a newer BKDG. For example - * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" - */ -static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) -{ - enum context ctx = error_context(m, regs); + ret = MCE_AR_SEVERITY; /* Processor Context Corrupt, no need to fumble too much, die! */ - if (m->status & MCI_STATUS_PCC) - return MCE_PANIC_SEVERITY; - - if (m->status & MCI_STATUS_UC) { - - if (ctx == IN_KERNEL) - return MCE_PANIC_SEVERITY; + if (m->status & MCI_STATUS_PCC) { + panic_msg = "Processor Context Corrupt"; + ret = MCE_PANIC_SEVERITY; + goto out; + } - /* - * On older systems where overflow_recov flag is not present, we - * should simply panic if an error overflow occurs. If - * overflow_recov flag is present and set, then software can try - * to at least kill process to prolong system operation. - */ - if (mce_flags.overflow_recov) { - if (mce_flags.smca) - return mce_severity_amd_smca(m, ctx); - - /* kill current process */ - return MCE_AR_SEVERITY; - } else { - /* at least one error was not logged */ - if (m->status & MCI_STATUS_OVER) - return MCE_PANIC_SEVERITY; - } - - /* - * For any other case, return MCE_UC_SEVERITY so that we log the - * error and exit #MC handler. - */ - return MCE_UC_SEVERITY; + if (m->status & MCI_STATUS_DEFERRED) { + ret = MCE_DEFERRED_SEVERITY; + goto out; } /* - * deferred error: poll handler catches these and adds to mce_ring so - * memory-failure can take recovery actions. + * If the UC bit is not set, the system either corrected or deferred + * the error. No action will be required after logging the error. */ - if (m->status & MCI_STATUS_DEFERRED) - return MCE_DEFERRED_SEVERITY; + if (!(m->status & MCI_STATUS_UC)) { + ret = MCE_KEEP_SEVERITY; + goto out; + } /* - * corrected error: poll handler catches these and passes responsibility - * of decoding the error to EDAC + * On MCA overflow, without the MCA overflow recovery feature the + * system will not be able to recover, panic. */ - return MCE_KEEP_SEVERITY; + if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) { + panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (!mce_flags.succor) { + panic_msg = "Uncorrected error without MCA Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (error_context(m, regs) == IN_KERNEL) { + panic_msg = "Uncorrected unrecoverable error in kernel context"; + ret = MCE_PANIC_SEVERITY; + } + +out: + if (msg && panic_msg) + *msg = panic_msg; + + return ret; } static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d28a9f8f3fec..025c8f0cd948 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - /* * Returns 1 if update has been found, 0 otherwise. */ @@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) struct extended_signature *ext_sig; int i; - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. headers: */ @@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } @@ -342,37 +328,6 @@ next: return patch; } -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = x86_family(eax); - model = x86_model(eax); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - static void show_saved_mc(void) { #ifdef DEBUG @@ -386,7 +341,7 @@ static void show_saved_mc(void) return; } - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); sig = uci.cpu_sig.sig; pf = uci.cpu_sig.pf; @@ -502,7 +457,7 @@ void show_ucode_info_early(void) struct ucode_cpu_info uci; if (delay_ucode_info) { - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); print_ucode_info(&uci, current_mc_date); delay_ucode_info = 0; } @@ -604,7 +559,7 @@ int __init save_microcode_in_initrd_intel(void) if (!(cp.data && cp.size)) return 0; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); scan_microcode(cp.data, cp.size, &uci, true); @@ -637,7 +592,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) if (!(cp.data && cp.size)) return NULL; - collect_cpu_info_early(uci); + intel_cpu_collect_info(uci); return scan_microcode(cp.data, cp.size, uci, false); } @@ -712,7 +667,7 @@ void reload_ucode_intel(void) struct microcode_intel *p; struct ucode_cpu_info uci; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); p = find_patch(&uci); if (!p) diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 4eec8889b0ff..099b6f0d96bd 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -84,14 +84,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = aperfmperf_get_khz(cpu); - - if (!freq) - freq = cpufreq_quick_get(cpu); - if (!freq) - freq = cpu_khz; - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); + unsigned int freq = arch_freq_get_on_cpu(cpu); + + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 83f901e2c2df..f276aff521e8 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -341,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus belong to parent ctrl group */ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); return -EINVAL; } /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Give any dropped cpus to parent rdtgroup */ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); update_closid_rmid(tmpmask, prgrp); @@ -359,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu rmid */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { if (crgrp == rdtgrp) @@ -394,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Can't drop from default group */ if (rdtgrp == &rdtgroup_default) { rdt_last_cmd_puts("Can't drop CPUs from default group\n"); @@ -413,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu closid/rmid. */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { if (r == rdtgrp) continue; cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (cpumask_weight(tmpmask1)) + if (!cpumask_empty(tmpmask1)) cpumask_rdtgrp_clear(r, tmpmask1); } update_closid_rmid(tmpmask, rdtgrp); @@ -488,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { ret = -EINVAL; rdt_last_cmd_puts("Can only assign online CPUs\n"); goto unlock; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 4143b1e4c5c6..dbaa8326d6f2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -43,6 +43,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 7c63a1911fae..3c24e6124d95 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,92 @@ #include "encls.h" #include "sgx.h" +#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) +/* + * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to + * determine the page index associated with the first PCMD entry + * within a PCMD page. + */ +#define PCMD_FIRST_MASK GENMASK(4, 0) + +/** + * reclaimer_writing_to_pcmd() - Query if any enclave page associated with + * a PCMD page is in process of being reclaimed. + * @encl: Enclave to which PCMD page belongs + * @start_addr: Address of enclave page using first entry within the PCMD page + * + * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is + * stored. The PCMD data of a reclaimed enclave page contains enough + * information for the processor to verify the page at the time + * it is loaded back into the Enclave Page Cache (EPC). + * + * The backing storage to which enclave pages are reclaimed is laid out as + * follows: + * Encrypted enclave pages:SECS page:PCMD pages + * + * Each PCMD page contains the PCMD metadata of + * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. + * + * A PCMD page can only be truncated if it is (a) empty, and (b) not in the + * process of getting data (and thus soon being non-empty). (b) is tested with + * a check if an enclave page sharing the PCMD page is in the process of being + * reclaimed. + * + * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it + * intends to reclaim that enclave page - it means that the PCMD page + * associated with that enclave page is about to get some data and thus + * even if the PCMD page is empty, it should not be truncated. + * + * Context: Enclave mutex (&sgx_encl->lock) must be held. + * Return: 1 if the reclaimer is about to write to the PCMD page + * 0 if the reclaimer has no intention to write to the PCMD page + */ +static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, + unsigned long start_addr) +{ + int reclaimed = 0; + int i; + + /* + * PCMD_FIRST_MASK is based on number of PCMD entries within + * PCMD page being 32. + */ + BUILD_BUG_ON(PCMDS_PER_PAGE != 32); + + for (i = 0; i < PCMDS_PER_PAGE; i++) { + struct sgx_encl_page *entry; + unsigned long addr; + + addr = start_addr + i * PAGE_SIZE; + + /* + * Stop when reaching the SECS page - it does not + * have a page_array entry and its reclaim is + * started and completed with enclave mutex held so + * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED + * flag. + */ + if (addr == encl->base + encl->size) + break; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + continue; + + /* + * VA page slot ID uses same bit as the flag so it is important + * to ensure that the page is not already in backing store. + */ + if (entry->epc_page && + (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { + reclaimed = 1; + break; + } + } + + return reclaimed; +} + /* * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's * follow right after the EPC data in the backing storage. In addition to the @@ -47,6 +133,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; struct sgx_encl *encl = encl_page->encl; pgoff_t page_index, page_pcmd_off; + unsigned long pcmd_first_page; struct sgx_pageinfo pginfo; struct sgx_backing b; bool pcmd_page_empty; @@ -58,6 +145,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, else page_index = PFN_DOWN(encl->size); + /* + * Address of enclave page using the first entry within the PCMD page. + */ + pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base; + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); ret = sgx_encl_get_backing(encl, page_index, &b); @@ -84,6 +176,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, } memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + set_page_dirty(b.pcmd); /* * The area for the PCMD in the page was zeroed above. Check if the @@ -94,12 +187,20 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); - sgx_encl_put_backing(&b, false); + get_page(b.pcmd); + sgx_encl_put_backing(&b); sgx_encl_truncate_backing_page(encl, page_index); - if (pcmd_page_empty) + if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + pcmd_page = kmap_atomic(b.pcmd); + if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) + pr_warn("PCMD page not empty after truncate.\n"); + kunmap_atomic(pcmd_page); + } + + put_page(b.pcmd); return ret; } @@ -645,15 +746,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, /** * sgx_encl_put_backing() - Unpin the backing storage * @backing: data for accessing backing storage for the page - * @do_write: mark pages dirty */ -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) +void sgx_encl_put_backing(struct sgx_backing *backing) { - if (do_write) { - set_page_dirty(backing->pcmd); - set_page_dirty(backing->contents); - } - put_page(backing->pcmd); put_page(backing->contents); } diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index fec43ca65065..d44e7372151f 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -107,7 +107,7 @@ void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8e4bc6453d26..ab4ec54bbdd9 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -191,6 +191,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); kunmap_atomic((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); @@ -308,6 +310,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--; + sgx_encl_put_backing(backing); if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), @@ -320,7 +323,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; - sgx_encl_put_backing(&secs_backing, true); + sgx_encl_put_backing(&secs_backing); } out: @@ -379,11 +382,14 @@ static void sgx_reclaim_pages(void) goto skip; page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + + mutex_lock(&encl_page->encl->lock); ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); - if (ret) + if (ret) { + mutex_unlock(&encl_page->encl->lock); goto skip; + } - mutex_lock(&encl_page->encl->lock); encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; mutex_unlock(&encl_page->encl->lock); continue; @@ -411,7 +417,6 @@ skip: encl_page = epc_page->owner; sgx_reclaimer_write(epc_page, &backing[i]); - sgx_encl_put_backing(&backing[i], true); kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index e8326a8d1c5d..9730c88530fc 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -407,7 +407,7 @@ int crash_load_segments(struct kimage *image) } image->elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->elf_load_addr, kbuf.bufsz, kbuf.bufsz); + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); return ret; } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c049561f373a..e28ab0ecc537 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -41,17 +41,7 @@ struct fpu_state_config fpu_user_cfg __ro_after_init; */ struct fpstate init_fpstate __ro_after_init; -/* - * Track whether the kernel is using the FPU state - * currently. - * - * This flag is used: - * - * - by IRQ context code to potentially use the FPU - * if it's unused. - * - * - to debug kernel_fpu_begin()/end() correctness - */ +/* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* @@ -59,42 +49,37 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static bool kernel_fpu_disabled(void) -{ - return this_cpu_read(in_kernel_fpu); -} - -static bool interrupted_kernel_fpu_idle(void) -{ - return !kernel_fpu_disabled(); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. */ bool irq_fpu_usable(void) { - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 39e1c8626ab9..c8340156bfd2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -142,7 +142,8 @@ static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) * Non-compacted format and legacy features use the cached fixed * offsets. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES) || xfeature <= XFEATURE_SSE) + if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || + xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* @@ -369,12 +370,12 @@ static void __init setup_init_fpu_buf(void) /* * All components are now in init state. Read the state back so * that init_fpstate contains all non-zero init state. This only - * works with XSAVE, but not with XSAVEOPT and XSAVES because + * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the - * data when XSAVES is available because XSAVES uses xstate + * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the @@ -584,7 +585,8 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; @@ -595,7 +597,7 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * Supervisor state components can be managed only by * XSAVES. */ - if (!compacted && xfeature_is_supervisor(i)) { + if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1); return false; } @@ -612,8 +614,11 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. + * + * This also takes compaction into account. So this works for + * XSAVEC as well. */ -static unsigned int __init get_xsaves_size(void) +static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* @@ -623,6 +628,10 @@ static unsigned int __init get_xsaves_size(void) * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. + * + * When XSAVES is not available but XSAVEC is (virt), then there + * are no supervisor states, but XSAVEC still uses compacted + * format. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); return ebx; @@ -632,13 +641,13 @@ static unsigned int __init get_xsaves_size(void) * Get the total size of the enabled xstates without the independent supervisor * features. */ -static unsigned int __init get_xsaves_size_no_independent(void) +static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) - return get_xsaves_size(); + return get_compacted_size(); /* Disable independent features. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); @@ -647,7 +656,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ - size = get_xsaves_size(); + size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); @@ -687,20 +696,21 @@ static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* - * XSAVES kernel size includes supervisor states and - * uses compacted format when available. + * XSAVES kernel size includes supervisor states and uses compacted + * format. XSAVEC uses compacted format, but does not save + * supervisor states. * - * XSAVE does not support supervisor states so - * kernel and user size is identical. + * XSAVE[OPT] do not support supervisor states so kernel and user + * size is identical. */ if (compacted) - kernel_size = get_xsaves_size_no_independent(); + kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; @@ -813,8 +823,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; - fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | - XFEATURE_MASK_SUPERVISOR_SUPPORTED; + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + else + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; @@ -837,6 +850,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; + /* Set up compaction feature bit */ + if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || + cpu_feature_enabled(X86_FEATURE_XSAVES)) + setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); + /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -873,7 +891,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, - boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); + boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: @@ -917,7 +935,7 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } @@ -1215,7 +1233,7 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, } for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); + mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); @@ -1525,7 +1543,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) * vendors into extending XFD for the pre AMX states, especially * AVX512. */ - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = ¤t->group_leader->thread.fpu; struct fpu_state_perm *perm; unsigned int ksize, usize; @@ -1687,16 +1705,13 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). */ -long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; - if (tsk != current) - return -EPERM; - switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index d22ace092ca2..5ad47031383b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -16,7 +16,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } @@ -79,6 +79,7 @@ static inline u64 xfeatures_mask_independent(void) /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" #define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" @@ -97,9 +98,11 @@ static inline u64 xfeatures_mask_independent(void) : "memory") /* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. + * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor + * states in addition to XSAVEC. + * + * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports + * compacted storage format in addition to XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. @@ -111,8 +114,9 @@ static inline u64 xfeatures_mask_independent(void) * address of the instruction where we might get an exception at. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ + asm volatile(ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1e31c7d21597..b09d73c2ba89 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -579,9 +579,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) extern void ftrace_graph_call(void); static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { @@ -610,18 +608,7 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, &ftrace_stub); } -#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ -int ftrace_enable_ftrace_graph_caller(void) -{ - return 0; -} - -int ftrace_disable_ftrace_graph_caller(void) -{ - return 0; -} -#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ -#endif /* !CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ /* * Hook the return address and push it in the stack of return addrs diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 4f5ecbbaae77..bd4a34100ed0 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,6 +40,7 @@ #include <asm/extable.h> #include <asm/trapnr.h> #include <asm/sev.h> +#include <asm/tdx.h> /* * Manage page tables very early on. @@ -143,7 +144,20 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv if (sme_get_me_mask()) { vaddr = (unsigned long)__start_bss_decrypted; vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + /* + * On SNP, transition the page to shared in the RMP table so that + * it is consistent with the page table attribute change. + * + * __start_bss_decrypted has a virtual address in the high range + * mapping (kernel .text). PVALIDATE, by way of + * early_snp_set_memory_shared(), requires a valid virtual + * address but the kernel is currently running off of the identity + * mapping so use __pa() to get a *currently* valid virtual address. + */ + early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); + i = pmd_index(vaddr); pmd[i] -= sme_get_me_mask(); } @@ -192,9 +206,6 @@ unsigned long __head __startup_64(unsigned long physaddr, if (load_delta & ~PMD_PAGE_MASK) for (;;); - /* Activate Secure Memory Encryption (SME) if supported and enabled */ - sme_enable(bp); - /* Include the SME encryption mask in the fixup value */ load_delta += sme_get_me_mask(); @@ -308,15 +319,6 @@ unsigned long __head __startup_64(unsigned long physaddr, return sme_postprocess_startup(bp, pmd); } -unsigned long __startup_secondary_64(void) -{ - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); -} - /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { @@ -416,6 +418,9 @@ void __init do_early_exception(struct pt_regs *regs, int trapnr) trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) return; + if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs)) + return; + early_fixup_exception(regs, trapnr); } @@ -514,6 +519,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) idt_setup_early_handler(); + /* Needed before cc_platform_has() can be used for TDX */ + tdx_early_init(); + copy_bootdata(__va(real_mode_data)); /* @@ -600,8 +608,10 @@ static void startup_64_load_idt(unsigned long physbase) void early_setup_idt(void) { /* VMM Communication Exception */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + setup_ghcb(); set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + } bringup_idt_descr.address = (unsigned long)bringup_idt_table; native_load_idt(&bringup_idt_descr); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b8e3019547a5..92c4afa2b729 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,10 +65,39 @@ SYM_CODE_START_NOALIGN(startup_64) leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp leaq _text(%rip), %rdi + + /* + * initial_gs points to initial fixed_percpu_data struct with storage for + * the stack protector canary. Global pointer fixups are needed at this + * stage, so apply them as is done in fixup_pointer(), and initialize %gs + * such that the canary can be accessed at %gs:40 for subsequent C calls. + */ + movl $MSR_GS_BASE, %ecx + movq initial_gs(%rip), %rax + movq $_text, %rdx + subq %rdx, %rax + addq %rdi, %rax + movq %rax, %rdx + shrq $32, %rdx + wrmsr + pushq %rsi call startup_64_setup_env popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Activate SEV/SME memory encryption if supported/enabled. This needs to + * be done now, since this also includes setup of the SEV-SNP CPUID table, + * which needs to be done before any CPUID instructions are executed in + * subsequent code. + */ + movq %rsi, %rdi + pushq %rsi + call sme_enable + popq %rsi +#endif + /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS leaq .Lon_kernel_cs(%rip), %rax @@ -134,16 +163,32 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ - pushq %rsi - call __startup_secondary_64 - popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + movq sme_me_mask, %rax +#else + xorq %rax, %rax +#endif /* Form the CR3 value being sure to include the CR3 modifier */ addq $(init_top_pgt - __START_KERNEL_map), %rax 1: +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. + */ + movq %cr4, %rcx + andl $X86_CR4_MCE, %ecx +#else + movl $0, %ecx +#endif + /* Enable PAE mode, PGE and LA57 */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL testl $1, __pgtable_l5_enabled(%rip) jz 1f @@ -249,13 +294,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx rdmsr + /* + * Preserve current value of EFER for comparison and to skip + * EFER writes if no change was made (for TDX guest) + */ + movl %eax, %edx btsl $_EFER_SCE, %eax /* Enable System Call */ btl $20,%edi /* No Execute supported? */ jnc 1f btsl $_EFER_NX, %eax btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) -1: wrmsr /* Make changes effective */ + /* Avoid writing EFER if no change was made (for TDX guest) */ +1: cmpl %edx, %eax + je 1f + xor %edx, %edx + wrmsr /* Make changes effective */ +1: /* Setup cr0 */ movl $CR0_STATE, %eax /* Make changes effective */ diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 608eb63bf044..a58c6bc1cd68 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -69,6 +69,9 @@ static const __initconst struct idt_data early_idts[] = { */ INTG(X86_TRAP_PF, asm_exc_page_fault), #endif +#ifdef CONFIG_INTEL_TDX_GUEST + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), +#endif }; /* diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e73f7df362f5..cec0bfa3bc04 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -157,7 +157,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; - if (!action->handler) + if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list))) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); @@ -177,7 +177,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) list_add_rcu(&action->list, &desc->head); else list_add_tail_rcu(&action->list, &desc->head); - + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -186,7 +186,7 @@ EXPORT_SYMBOL(__register_nmi_handler); void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); - struct nmiaction *n; + struct nmiaction *n, *found = NULL; unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); @@ -200,12 +200,16 @@ void unregister_nmi_handler(unsigned int type, const char *name) WARN(in_nmi(), "Trying to free NMI (%s) from NMI context!\n", n->name); list_del_rcu(&n->list); + found = n; break; } } raw_spin_unlock_irqrestore(&desc->lock, flags); - synchronize_rcu(); + if (found) { + synchronize_rcu(); + INIT_LIST_HEAD(&found->list); + } } EXPORT_SYMBOL_GPL(unregister_nmi_handler); diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 36e84d904260..319fef37d9dc 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -21,6 +21,7 @@ #include <asm/sections.h> #include <asm/io.h> #include <asm/setup_arch.h> +#include <asm/sev.h> static struct resource system_rom_resource = { .name = "System ROM", @@ -197,11 +198,21 @@ static int __init romchecksum(const unsigned char *rom, unsigned long length) void __init probe_roms(void) { - const unsigned char *rom; unsigned long start, length, upper; + const unsigned char *rom; unsigned char c; int i; + /* + * The ROM memory range is not part of the e820 table and is therefore not + * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted + * memory, and SNP requires encrypted memory to be validated before access. + * Do that here. + */ + snp_prep_memory(video_rom_resource.start, + ((system_rom_resource.end + 1) - video_rom_resource.start), + SNP_PAGE_STATE_PRIVATE); + /* video rom */ upper = adapter_rom_resources[0].start; for (start = video_rom_resource.start; start < upper; start += 2048) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..58fb48d3004f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -46,6 +46,7 @@ #include <asm/proto.h> #include <asm/frame.h> #include <asm/unwind.h> +#include <asm/tdx.h> #include "process.h" @@ -160,6 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, savesegment(ds, p->thread.ds); #else p->thread.sp0 = (unsigned long) (childregs + 1); + savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain @@ -191,10 +193,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, if (sp) childregs->sp = sp; -#ifdef CONFIG_X86_32 - task_user_gs(p) = get_user_gs(current_pt_regs()); -#endif - if (unlikely(p->flags & PF_IO_WORKER)) { /* * An IO thread is a user space thread, but it doesn't @@ -334,7 +332,7 @@ static int get_cpuid_mode(void) return !test_thread_flag(TIF_NOCPUID); } -static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; @@ -405,7 +403,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) } /** - * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { @@ -686,9 +684,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } - - if ((tifp ^ tifn) & _TIF_SLD) - switch_to_sld(tifn); } /* @@ -873,6 +868,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); x86_idle = mwait_idle; + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + x86_idle = tdx_safe_halt; } else x86_idle = default_idle; } @@ -985,20 +983,19 @@ unsigned long __get_wchan(struct task_struct *p) return addr; } -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2) +long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, arg2); + return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: case ARCH_GET_XCOMP_GUEST_PERM: case ARCH_REQ_XCOMP_GUEST_PERM: - return fpu_xstate_prctl(task, option, arg2); + return fpu_xstate_prctl(option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 26edb1cd07a4..2f314b170c9f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -63,10 +63,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, unsigned long d0, d1, d2, d3, d6, d7; unsigned short gs; - if (user_mode(regs)) - gs = get_user_gs(regs); - else - savesegment(gs, gs); + savesegment(gs, gs); show_ip(regs, log_lvl); @@ -114,7 +111,7 @@ void release_thread(struct task_struct *dead_task) void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - set_user_gs(regs, 0); + loadsegment(gs, 0); regs->fs = 0; regs->ds = __USER_DS; regs->es = __USER_DS; @@ -177,7 +174,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. */ - lazy_save_gs(prev->gs); + savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -208,7 +205,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Restore %gs if needed (which is common) */ if (prev->gs | next->gs) - lazy_load_gs(next->gs); + loadsegment(gs, next->gs); this_cpu_write(current_task, next_p); @@ -222,5 +219,5 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e459253649be..1962008fe743 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -844,7 +844,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) ret = do_arch_prctl_64(current, option, arg2); if (ret == -EINVAL) - ret = do_arch_prctl_common(current, option, arg2); + ret = do_arch_prctl_common(option, arg2); return ret; } @@ -852,7 +852,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) #ifdef CONFIG_IA32_EMULATION COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } #endif diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 98d10ef60571..37c12fb92906 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -170,9 +170,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) - retval = get_user_gs(task_pt_regs(task)); + savesegment(gs, retval); else - retval = task_user_gs(task); + retval = task->thread.gs; } return retval; } @@ -210,7 +210,7 @@ static int set_segment_reg(struct task_struct *task, break; case offsetof(struct user_regs_struct, gs): - task_user_gs(task) = value; + task->thread.gs = value; } return 0; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c95b9ac5a457..249981bf3d8a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -756,6 +756,30 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +void x86_configure_nx(void) +{ + if (boot_cpu_has(X86_FEATURE_NX)) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} + +static void __init x86_report_nx(void) +{ + if (!boot_cpu_has(X86_FEATURE_NX)) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -896,9 +920,7 @@ void __init setup_arch(char **cmdline_p) /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug - * console setup can safely call set_fixmap()). It may then be called - * again from within noexec_setup() during parsing early parameters - * to honor the respective command line option. + * console setup can safely call set_fixmap()). */ x86_configure_nx(); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index ce987688bbc0..b478edf43bec 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -14,6 +14,68 @@ #define has_cpuflag(f) boot_cpu_has(f) #endif +/* I/O parameters for CPUID-related helpers */ +struct cpuid_leaf { + u32 fn; + u32 subfn; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. + */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. + */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. + */ +static u16 ghcb_version __ro_after_init; + +/* Copy of the SNP firmware's CPUID page. */ +static struct snp_cpuid_table cpuid_table_copy __ro_after_init; + +/* + * These will be initialized based on CPUID table so that non-present + * all-zero leaves (for sparse tables) can be differentiated from + * invalid/out-of-range leaves. This is needed since all-zero leaves + * still need to be post-processed. + */ +static u32 cpuid_std_range_max __ro_after_init; +static u32 cpuid_hyp_range_max __ro_after_init; +static u32 cpuid_ext_range_max __ro_after_init; + static bool __init sev_es_check_cpu_features(void) { if (!has_cpuflag(X86_FEATURE_RDRAND)) { @@ -24,15 +86,12 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void __noreturn sev_es_terminate(unsigned int reason) +static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; - /* - * Tell the hypervisor what went wrong - only reason-set 0 is - * currently supported. - */ - val |= GHCB_SEV_TERM_REASON(0, reason); + /* Tell the hypervisor what went wrong. */ + val |= GHCB_SEV_TERM_REASON(set, reason); /* Request Guest Termination from Hypvervisor */ sev_es_wr_ghcb_msr(val); @@ -42,6 +101,42 @@ static void __noreturn sev_es_terminate(unsigned int reason) asm volatile("hlt\n" : : : "memory"); } +/* + * The hypervisor features are available from GHCB version 2 onward. + */ +static u64 get_hv_features(void) +{ + u64 val; + + if (ghcb_version < 2) + return 0; + + sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) + return 0; + + return GHCB_MSR_HV_FT_RESP_VAL(val); +} + +static void snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + static bool sev_es_negotiate_protocol(void) { u64 val; @@ -54,10 +149,12 @@ static bool sev_es_negotiate_protocol(void) if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) return false; - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR) + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) return false; + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + return true; } @@ -104,10 +201,7 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt if (ret == 1) { u64 info = ghcb->save.sw_exit_info_2; - unsigned long v; - - info = ghcb->save.sw_exit_info_2; - v = info & SVM_EVTINJ_VEC_MASK; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; /* Check if exception information from hypervisor is sane. */ if ((info & SVM_EVTINJ_VALID) && @@ -130,7 +224,7 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, u64 exit_info_1, u64 exit_info_2) { /* Fill in protocol and format specifiers */ - ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->protocol_version = ghcb_version; ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; ghcb_set_sw_exit_code(ghcb, exit_code); @@ -150,6 +244,290 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, return verify_exception_info(ghcb, ctxt); } +static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) +{ + u64 val; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + return -EIO; + + *reg = (val >> 32); + + return 0; +} + +static int sev_cpuid_hv(struct cpuid_leaf *leaf) +{ + int ret; + + /* + * MSR protocol does not support fetching non-zero subfunctions, but is + * sufficient to handle current early-boot cases. Should that change, + * make sure to report an error rather than ignoring the index and + * grabbing random values. If this issue arises in the future, handling + * can be added here to use GHCB-page protocol for cases that occur late + * enough in boot that GHCB page is available. + */ + if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn) + return -EINVAL; + + ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx); + + return ret; +} + +/* + * This may be called early while still running on the initial identity + * mapping. Use RIP-relative addressing to obtain the correct address + * while running with the initial identity mapping as well as the + * switch-over to kernel virtual addresses later. + */ +static const struct snp_cpuid_table *snp_cpuid_get_table(void) +{ + void *ptr; + + asm ("lea cpuid_table_copy(%%rip), %0" + : "=r" (ptr) + : "p" (&cpuid_table_copy)); + + return ptr; +} + +/* + * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of + * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 + * and 1 based on the corresponding features enabled by a particular + * combination of XCR0 and XSS registers so that a guest can look up the + * version corresponding to the features currently enabled in its XCR0/XSS + * registers. The only values that differ between these versions/table + * entries is the enabled XSAVE area size advertised via EBX. + * + * While hypervisors may choose to make use of this support, it is more + * robust/secure for a guest to simply find the entry corresponding to the + * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the + * XSAVE area size using subfunctions 2 through 64, as documented in APM + * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. + * + * Since base/legacy XSAVE area size is documented as 0x240, use that value + * directly rather than relying on the base size in the CPUID table. + * + * Return: XSAVE area size on success, 0 otherwise. + */ +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + u64 xfeatures_found = 0; + u32 xsave_size = 0x240; + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) + continue; + if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) + continue; + if (xfeatures_found & (BIT_ULL(e->ecx_in))) + continue; + + xfeatures_found |= (BIT_ULL(e->ecx_in)); + + if (compacted) + xsave_size += e->eax; + else + xsave_size = max(xsave_size, e->eax + e->ebx); + } + + /* + * Either the guest set unsupported XCR0/XSS bits, or the corresponding + * entries in the CPUID table were not present. This is not a valid + * state to be in. + */ + if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) + return 0; + + return xsave_size; +} + +static bool +snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (e->eax_in != leaf->fn) + continue; + + if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) + continue; + + /* + * For 0xD subfunctions 0 and 1, only use the entry corresponding + * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). + * See the comments above snp_cpuid_calc_xsave_size() for more + * details. + */ + if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) + if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) + continue; + + leaf->eax = e->eax; + leaf->ebx = e->ebx; + leaf->ecx = e->ecx; + leaf->edx = e->edx; + + return true; + } + + return false; +} + +static void snp_cpuid_hv(struct cpuid_leaf *leaf) +{ + if (sev_cpuid_hv(leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + +static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) +{ + struct cpuid_leaf leaf_hv = *leaf; + + switch (leaf->fn) { + case 0x1: + snp_cpuid_hv(&leaf_hv); + + /* initial APIC ID */ + leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); + /* APIC enabled bit */ + leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); + + /* OSXSAVE enabled bit */ + if (native_read_cr4() & X86_CR4_OSXSAVE) + leaf->ecx |= BIT(27); + break; + case 0x7: + /* OSPKE enabled bit */ + leaf->ecx &= ~BIT(4); + if (native_read_cr4() & X86_CR4_PKE) + leaf->ecx |= BIT(4); + break; + case 0xB: + leaf_hv.subfn = 0; + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->edx = leaf_hv.edx; + break; + case 0xD: { + bool compacted = false; + u64 xcr0 = 1, xss = 0; + u32 xsave_size; + + if (leaf->subfn != 0 && leaf->subfn != 1) + return 0; + + if (native_read_cr4() & X86_CR4_OSXSAVE) + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if (leaf->subfn == 1) { + /* Get XSS value if XSAVES is enabled. */ + if (leaf->eax & BIT(3)) { + unsigned long lo, hi; + + asm volatile("rdmsr" : "=a" (lo), "=d" (hi) + : "c" (MSR_IA32_XSS)); + xss = (hi << 32) | lo; + } + + /* + * The PPR and APM aren't clear on what size should be + * encoded in 0xD:0x1:EBX when compaction is not enabled + * by either XSAVEC (feature bit 1) or XSAVES (feature + * bit 3) since SNP-capable hardware has these feature + * bits fixed as 1. KVM sets it to 0 in this case, but + * to avoid this becoming an issue it's safer to simply + * treat this as unsupported for SNP guests. + */ + if (!(leaf->eax & (BIT(1) | BIT(3)))) + return -EINVAL; + + compacted = true; + } + + xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); + if (!xsave_size) + return -EINVAL; + + leaf->ebx = xsave_size; + } + break; + case 0x8000001E: + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->eax = leaf_hv.eax; + /* compute ID */ + leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); + /* node ID */ + leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); + break; + default: + /* No fix-ups needed, use values as-is. */ + break; + } + + return 0; +} + +/* + * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value + * should be treated as fatal by caller. + */ +static int snp_cpuid(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return -EOPNOTSUPP; + + if (!snp_cpuid_get_validated_func(leaf)) { + /* + * Some hypervisors will avoid keeping track of CPUID entries + * where all values are zero, since they can be handled the + * same as out-of-range values (all-zero). This is useful here + * as well as it allows virtually all guest configurations to + * work using a single SNP CPUID table. + * + * To allow for this, there is a need to distinguish between + * out-of-range entries and in-range zero entries, since the + * CPUID table entries are only a template that may need to be + * augmented with additional values for things like + * CPU-specific information during post-processing. So if it's + * not in the table, set the values to zero. Then, if they are + * within a valid CPUID range, proceed with post-processing + * using zeros as the initial values. Otherwise, skip + * post-processing and just return zeros immediately. + */ + leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; + + /* Skip post-processing for out-of-range zero leafs. */ + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) + return 0; + } + + return snp_cpuid_postprocess(leaf); +} + /* * Boot VC Handler - This is the first VC handler during boot, there is no GHCB * page yet, so it only supports the MSR based communication with the @@ -157,40 +535,33 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, */ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { + unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); - unsigned long val; + struct cpuid_leaf leaf; + int ret; /* Only CPUID is supported via MSR protocol */ if (exit_code != SVM_EXIT_CPUID) goto fail; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - goto fail; - regs->ax = val >> 32; + leaf.fn = fn; + leaf.subfn = subfn; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - goto fail; - regs->bx = val >> 32; + ret = snp_cpuid(&leaf); + if (!ret) + goto cpuid_done; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + if (ret != -EOPNOTSUPP) goto fail; - regs->cx = val >> 32; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + if (sev_cpuid_hv(&leaf)) goto fail; - regs->dx = val >> 32; + +cpuid_done: + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; /* * This is a VC handler and the #VC is only raised when SEV-ES is @@ -221,7 +592,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) fail: /* Terminate the guest */ - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, @@ -481,12 +852,37 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static int vc_handle_cpuid_snp(struct pt_regs *regs) +{ + struct cpuid_leaf leaf; + int ret; + + leaf.fn = regs->ax; + leaf.subfn = regs->cx; + ret = snp_cpuid(&leaf); + if (!ret) { + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + } + + return ret; +} + static enum es_result vc_handle_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct pt_regs *regs = ctxt->regs; u32 cr4 = native_read_cr4(); enum es_result ret; + int snp_cpuid_ret; + + snp_cpuid_ret = vc_handle_cpuid_snp(regs); + if (!snp_cpuid_ret) + return ES_OK; + if (snp_cpuid_ret != -EOPNOTSUPP) + return ES_VMM_ERROR; ghcb_set_rax(ghcb, regs->ax); ghcb_set_rcx(ghcb, regs->cx); @@ -538,3 +934,67 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. + */ +static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} + +/* + * Initialize the kernel's copy of the SNP CPUID table, and set up the + * pointer that will be used to access it. + * + * Maintaining a direct mapping of the SNP CPUID table used by firmware would + * be possible as an alternative, but the approach is brittle since the + * mapping needs to be updated in sync with all the changes to virtual memory + * layout and related mapping facilities throughout the boot process. + */ +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +{ + const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; + int i; + + if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; + if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table = snp_cpuid_get_table(); + memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); + + /* Initialize CPUID ranges for range-checking. */ + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + if (fn->eax_in == 0x0) + cpuid_std_range_max = fn->eax; + else if (fn->eax_in == 0x40000000) + cpuid_hyp_range_max = fn->eax; + else if (fn->eax_in == 0x80000000) + cpuid_ext_range_max = fn->eax; + } +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index e6d316a01fdd..c05f0124c410 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -18,6 +18,10 @@ #include <linux/memblock.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/efi.h> +#include <linux/platform_device.h> +#include <linux/io.h> #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> @@ -31,9 +35,28 @@ #include <asm/svm.h> #include <asm/smp.h> #include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid.h> +#include <asm/cmdline.h> #define DR7_RESET_VALUE 0x400 +/* AP INIT values as documented in the APM2 section "Processor Initialization State" */ +#define AP_INIT_CS_LIMIT 0xffff +#define AP_INIT_DS_LIMIT 0xffff +#define AP_INIT_LDTR_LIMIT 0xffff +#define AP_INIT_GDTR_LIMIT 0xffff +#define AP_INIT_IDTR_LIMIT 0xffff +#define AP_INIT_TR_LIMIT 0xffff +#define AP_INIT_RFLAGS_DEFAULT 0x2 +#define AP_INIT_DR6_DEFAULT 0xffff0ff0 +#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define AP_INIT_XCR0_DEFAULT 0x1 +#define AP_INIT_X87_FTW_DEFAULT 0x5555 +#define AP_INIT_X87_FCW_DEFAULT 0x0040 +#define AP_INIT_CR0_DEFAULT 0x60000010 +#define AP_INIT_MXCSR_DEFAULT 0x1f80 + /* For early boot hypervisor communication in SEV-ES enabled guests */ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); @@ -41,7 +64,10 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); * Needs to be in the .data section because we need it NULL before bss is * cleared */ -static struct ghcb __initdata *boot_ghcb; +static struct ghcb *boot_ghcb __section(".data"); + +/* Bitmap of SEV features supported by the hypervisor */ +static u64 sev_hv_features __ro_after_init; /* #VC handler runtime per-CPU data */ struct sev_es_runtime_data { @@ -87,6 +113,15 @@ struct ghcb_state { static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); +static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + +struct sev_config { + __u64 debug : 1, + __reserved : 63; +}; + +static struct sev_config sev_cfg __read_mostly; + static __always_inline bool on_vc_stack(struct pt_regs *regs) { unsigned long sp = regs->sp; @@ -523,13 +558,68 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } -static u64 get_jump_table_addr(void) +static u64 __init get_secrets_page(void) +{ + u64 pa_data = boot_params.cc_blob_address; + struct cc_blob_sev_info info; + void *map; + + /* + * The CC blob contains the address of the secrets page, check if the + * blob is present. + */ + if (!pa_data) + return 0; + + map = early_memremap(pa_data, sizeof(info)); + if (!map) { + pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); + return 0; + } + memcpy(&info, map, sizeof(info)); + early_memunmap(map, sizeof(info)); + + /* smoke-test the secrets page passed */ + if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) + return 0; + + return info.secrets_phys; +} + +static u64 __init get_snp_jump_table_addr(void) +{ + struct snp_secrets_page_layout *layout; + void __iomem *mem; + u64 pa, addr; + + pa = get_secrets_page(); + if (!pa) + return 0; + + mem = ioremap_encrypted(pa, PAGE_SIZE); + if (!mem) { + pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); + return 0; + } + + layout = (__force struct snp_secrets_page_layout *)mem; + + addr = layout->os_area.ap_jump_table_pa; + iounmap(mem); + + return addr; +} + +static u64 __init get_jump_table_addr(void) { struct ghcb_state state; unsigned long flags; struct ghcb *ghcb; u64 ret = 0; + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return get_snp_jump_table_addr(); + local_irq_save(flags); ghcb = __sev_get_ghcb(&state); @@ -553,7 +643,496 @@ static u64 get_jump_table_addr(void) return ret; } -int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) +{ + unsigned long vaddr_end; + int rc; + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + vaddr = vaddr + PAGE_SIZE; + } +} + +static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) +{ + unsigned long paddr_end; + u64 val; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + /* + * Use the MSR protocol because this function can be called before + * the GHCB is established. + */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, + "Wrong PSC response code: 0x%x\n", + (unsigned int)GHCB_RESP_CODE(val))) + goto e_term; + + if (WARN(GHCB_MSR_PSC_RESP_VAL(val), + "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", + op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", + paddr, GHCB_MSR_PSC_RESP_VAL(val))) + goto e_term; + + paddr = paddr + PAGE_SIZE; + } + + return; + +e_term: + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. + */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); + + /* Validate the memory pages after they've been added in the RMP table. */ + pvalidate_pages(vaddr, npages, true); +} + +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* Invalidate the memory pages before they are marked shared in the RMP table. */ + pvalidate_pages(vaddr, npages, false); + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); +} + +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) +{ + unsigned long vaddr, npages; + + vaddr = (unsigned long)__va(paddr); + npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (op == SNP_PAGE_STATE_PRIVATE) + early_snp_set_memory_private(vaddr, paddr, npages); + else if (op == SNP_PAGE_STATE_SHARED) + early_snp_set_memory_shared(vaddr, paddr, npages); + else + WARN(1, "invalid memory op %d\n", op); +} + +static int vmgexit_psc(struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = 1; + goto out_unlock; + } + + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; + + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + + /* + * Page State Change VMGEXIT can pass error code through + * exit_info_2. + */ + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } + + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } + + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. + */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } + } + +out: + __sev_put_ghcb(&state); + +out_unlock: + local_irq_restore(flags); + + return ret; +} + +static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, + unsigned long vaddr_end, int op) +{ + struct psc_hdr *hdr; + struct psc_entry *e; + unsigned long pfn; + int i; + + hdr = &data->hdr; + e = data->entries; + + memset(data, 0, sizeof(*data)); + i = 0; + + while (vaddr < vaddr_end) { + if (is_vmalloc_addr((void *)vaddr)) + pfn = vmalloc_to_pfn((void *)vaddr); + else + pfn = __pa(vaddr) >> PAGE_SHIFT; + + e->gfn = pfn; + e->operation = op; + hdr->end_entry = i; + + /* + * Current SNP implementation doesn't keep track of the RMP page + * size so use 4K for simplicity. + */ + e->pagesize = RMP_PG_SIZE_4K; + + vaddr = vaddr + PAGE_SIZE; + e++; + i++; + } + + if (vmgexit_psc(data)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) +{ + unsigned long vaddr_end, next_vaddr; + struct snp_psc_desc *desc; + + desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); + if (!desc) + panic("SNP: failed to allocate memory for PSC descriptor\n"); + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + /* Calculate the last vaddr that fits in one struct snp_psc_desc. */ + next_vaddr = min_t(unsigned long, vaddr_end, + (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); + + __set_pages_state(desc, vaddr, next_vaddr, op); + + vaddr = next_vaddr; + } + + kfree(desc); +} + +void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + pvalidate_pages(vaddr, npages, false); + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); +} + +void snp_set_memory_private(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); + + pvalidate_pages(vaddr, npages, true); +} + +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). + */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) +#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) +#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) + +#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) +#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) + +static void *snp_alloc_vmsa_page(void) +{ + struct page *p; + + /* + * Allocate VMSA page to work around the SNP erratum where the CPU will + * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) + * collides with the RMP entry of VMSA page. The recommended workaround + * is to not use a large page. + * + * Allocate an 8k page which is also 8k-aligned. + */ + p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + if (!p) + return NULL; + + split_page(p, 1); + + /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ + __free_page(p); + + return page_address(p + 1); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) +{ + struct sev_es_save_area *cur_vmsa, *vmsa; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u8 sipi_vector; + int cpu, ret; + u64 cr4; + + /* + * The hypervisor SNP feature support check has happened earlier, just check + * the AP_CREATION one here. + */ + if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) + return -EOPNOTSUPP; + + /* + * Verify the desired start IP against the known trampoline start IP + * to catch any future new trampolines that may be introduced that + * would require a new protected guest entry point. + */ + if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, + "Unsupported SNP start_ip: %lx\n", start_ip)) + return -EINVAL; + + /* Override start_ip with known protected guest start IP */ + start_ip = real_mode_header->sev_es_trampoline_start; + + /* Find the logical CPU for the APIC ID */ + for_each_present_cpu(cpu) { + if (arch_match_cpu_phys_id(cpu, apic_id)) + break; + } + if (cpu >= nr_cpu_ids) + return -EINVAL; + + cur_vmsa = per_cpu(sev_vmsa, cpu); + + /* + * A new VMSA is created each time because there is no guarantee that + * the current VMSA is the kernels or that the vCPU is not running. If + * an attempt was done to use the current VMSA with a running vCPU, a + * #VMEXIT of that vCPU would wipe out all of the settings being done + * here. + */ + vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); + if (!vmsa) + return -ENOMEM; + + /* CR4 should maintain the MCE value */ + cr4 = native_read_cr4() & X86_CR4_MCE; + + /* Set the CS value based on the start_ip converted to a SIPI vector */ + sipi_vector = (start_ip >> 12); + vmsa->cs.base = sipi_vector << 12; + vmsa->cs.limit = AP_INIT_CS_LIMIT; + vmsa->cs.attrib = INIT_CS_ATTRIBS; + vmsa->cs.selector = sipi_vector << 8; + + /* Set the RIP value based on start_ip */ + vmsa->rip = start_ip & 0xfff; + + /* Set AP INIT defaults as documented in the APM */ + vmsa->ds.limit = AP_INIT_DS_LIMIT; + vmsa->ds.attrib = INIT_DS_ATTRIBS; + vmsa->es = vmsa->ds; + vmsa->fs = vmsa->ds; + vmsa->gs = vmsa->ds; + vmsa->ss = vmsa->ds; + + vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; + vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; + vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; + vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; + vmsa->tr.limit = AP_INIT_TR_LIMIT; + vmsa->tr.attrib = INIT_TR_ATTRIBS; + + vmsa->cr4 = cr4; + vmsa->cr0 = AP_INIT_CR0_DEFAULT; + vmsa->dr7 = DR7_RESET_VALUE; + vmsa->dr6 = AP_INIT_DR6_DEFAULT; + vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; + vmsa->g_pat = AP_INIT_GPAT_DEFAULT; + vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; + vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; + vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; + vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + + /* SVME must be set. */ + vmsa->efer = EFER_SVME; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + /* Switch the page over to a VMSA page now that it is initialized */ + ret = snp_set_vmsa(vmsa, true); + if (ret) { + pr_err("set VMSA page failed (%u)\n", ret); + free_page((unsigned long)vmsa); + + return -EINVAL; + } + + /* Issue VMGEXIT AP Creation NAE event */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_rax(ghcb, vmsa->sev_features); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP Creation error\n"); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Perform cleanup if there was an error */ + if (ret) { + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(sev_vmsa, cpu) = vmsa; + + return ret; +} + +void snp_set_wakeup_secondary_cpu(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Always set this override if SNP is enabled. This makes it the + * required method to start APs under SNP. If the hypervisor does + * not support AP creation, then no APs will be started. + */ + apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; +} + +int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; phys_addr_t jump_table_pa; @@ -644,15 +1223,39 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } -/* - * This function runs on the first #VC exception after the kernel - * switched to virtual addresses. - */ -static bool __init sev_es_setup_ghcb(void) +static void snp_register_per_cpu_ghcb(void) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + snp_register_ghcb_early(__pa(ghcb)); +} + +void setup_ghcb(void) { + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return; + /* First make sure the hypervisor talks a supported protocol. */ if (!sev_es_negotiate_protocol()) - return false; + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* + * Check whether the runtime #VC exception handler is active. It uses + * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). + * + * If SNP is active, register the per-CPU GHCB page so that the runtime + * exception handler can use it. + */ + if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_per_cpu_ghcb(); + + return; + } /* * Clear the boot_ghcb. The first exception comes in before the bss @@ -663,7 +1266,9 @@ static bool __init sev_es_setup_ghcb(void) /* Alright - Make the boot-ghcb public */ boot_ghcb = &boot_ghcb_page; - return true; + /* SNP guest requires that GHCB GPA must be registered. */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); } #ifdef CONFIG_HOTPLUG_CPU @@ -766,6 +1371,17 @@ void __init sev_es_init_vc_handling(void) if (!sev_es_check_cpu_features()) panic("SEV-ES CPU Features missing"); + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + sev_hv_features = get_hv_features(); + + if (!(sev_hv_features & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + } + /* Enable SEV-ES special handling */ static_branch_enable(&sev_es_enable_key); @@ -1337,7 +1953,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) show_regs(regs); /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); /* If that fails and we get here - just panic */ panic("Returned from Terminate-Request to Hypervisor\n"); @@ -1383,10 +1999,6 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) struct es_em_ctxt ctxt; enum es_result result; - /* Do initial setup or terminate the guest */ - if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); - vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); @@ -1425,6 +2037,215 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) fail: show_regs(regs); - while (true) - halt(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. */ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. + */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + snp_abort(); + + return cc_info; +} + +bool __init snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + setup_cpuid_table(cc_info); + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void __init snp_abort(void) +{ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); +} + +static void dump_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i = 0; + + pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", + cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); + + for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", + i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, + fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); + } +} + +/* + * It is useful from an auditing/testing perspective to provide an easy way + * for the guest owner to know that the CPUID table has been initialized as + * expected, but that initialization happens too early in boot to print any + * sort of indicator, and there's not really any other good place to do it, + * so do it here. + */ +static int __init report_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return 0; + + pr_info("Using SNP CPUID table, %d entries present.\n", + cpuid_table->count); + + if (sev_cfg.debug) + dump_cpuid_table(); + + return 0; +} +arch_initcall(report_cpuid_table); + +static int __init init_sev_config(char *str) +{ + char *s; + + while ((s = strsep(&str, ","))) { + if (!strcmp(s, "debug")) { + sev_cfg.debug = true; + continue; + } + + pr_info("SEV command-line option '%s' was not recognized\n", s); + } + + return 1; +} +__setup("sev=", init_sev_config); + +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + if (!fw_err) + return -EINVAL; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = -EIO; + goto e_restore_irq; + } + + vc_ghcb_invalidate(ghcb); + + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + ghcb_set_rax(ghcb, input->data_gpa); + ghcb_set_rbx(ghcb, input->data_npages); + } + + ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + if (ret) + goto e_put; + + if (ghcb->save.sw_exit_info_2) { + /* Number of expected pages are returned in RBX */ + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST && + ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN) + input->data_npages = ghcb_get_rbx(ghcb); + + *fw_err = ghcb->save.sw_exit_info_2; + + ret = -EIO; + } + +e_put: + __sev_put_ghcb(&state); +e_restore_irq: + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(snp_issue_guest_request); + +static struct platform_device sev_guest_device = { + .name = "sev-guest", + .id = -1, +}; + +static int __init snp_init_platform_device(void) +{ + struct sev_guest_platform_data data; + u64 gpa; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + gpa = get_secrets_page(); + if (!gpa) + return -ENODEV; + + data.secrets_gpa = gpa; + if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) + return -ENODEV; + + if (platform_device_register(&sev_guest_device)) + return -ENODEV; + + pr_info("SNP guest platform device initialized.\n"); + return 0; } +device_initcall(snp_init_platform_device); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e439eb14325f..9c7265b524c7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -93,7 +93,7 @@ static bool restore_sigcontext(struct pt_regs *regs, return false; #ifdef CONFIG_X86_32 - set_user_gs(regs, sc.gs); + loadsegment(gs, sc.gs); regs->fs = sc.fs; regs->es = sc.es; regs->ds = sc.ds; @@ -146,8 +146,10 @@ __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { #ifdef CONFIG_X86_32 - unsafe_put_user(get_user_gs(regs), - (unsigned int __user *)&sc->gs, Efault); + unsigned int gs; + savesegment(gs, gs); + + unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault); unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index b52407c56000..879ef8c72f5c 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -149,8 +149,10 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18); CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2ef14772dc04..5e7f9532a10d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,7 +56,6 @@ #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> -#include <linux/syscore_ops.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -82,6 +81,7 @@ #include <asm/spec-ctrl.h> #include <asm/hw_irq.h> #include <asm/stackprotector.h> +#include <asm/sev.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -187,7 +187,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - init_freq_invariance(true, false); + ap_init_aperfmperf(); /* * Get our bogomips. @@ -1082,6 +1082,11 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, unsigned long boot_error = 0; unsigned long timeout; +#ifdef CONFIG_X86_64 + /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ + if (apic->wakeup_secondary_cpu_64) + start_ip = real_mode_header->trampoline_start64; +#endif idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; @@ -1123,11 +1128,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, /* * Wake up a CPU in difference cases: - * - Use the method in the APIC driver if it's defined + * - Use a method from the APIC driver if one defined, with wakeup + * straight to 64-bit mode preferred over wakeup to RM. * Otherwise, * - Use an INIT boot APIC message for APs or NMI for BSP. */ - if (apic->wakeup_secondary_cpu) + if (apic->wakeup_secondary_cpu_64) + boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip); + else if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, @@ -1397,7 +1405,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - init_freq_invariance(false, false); smp_sanity_check(); switch (apic_intr_mode) { @@ -1430,6 +1437,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) smp_quirk_init_udelay(); speculative_store_bypass_ht_init(); + + snp_set_wakeup_secondary_cpu(); } void arch_thaw_secondary_cpus_begin(void) @@ -1847,357 +1856,3 @@ void native_play_dead(void) } #endif - -#ifdef CONFIG_X86_64 -/* - * APERF/MPERF frequency ratio computation. - * - * The scheduler wants to do frequency invariant accounting and needs a <1 - * ratio to account for the 'current' frequency, corresponding to - * freq_curr / freq_max. - * - * Since the frequency freq_curr on x86 is controlled by micro-controller and - * our P-state setting is little more than a request/hint, we need to observe - * the effective frequency 'BusyMHz', i.e. the average frequency over a time - * interval after discarding idle time. This is given by: - * - * BusyMHz = delta_APERF / delta_MPERF * freq_base - * - * where freq_base is the max non-turbo P-state. - * - * The freq_max term has to be set to a somewhat arbitrary value, because we - * can't know which turbo states will be available at a given point in time: - * it all depends on the thermal headroom of the entire package. We set it to - * the turbo level with 4 cores active. - * - * Benchmarks show that's a good compromise between the 1C turbo ratio - * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, - * which would ignore the entire turbo range (a conspicuous part, making - * freq_curr/freq_max always maxed out). - * - * An exception to the heuristic above is the Atom uarch, where we choose the - * highest turbo level for freq_max since Atom's are generally oriented towards - * power efficiency. - * - * Setting freq_max to anything less than the 1C turbo ratio makes the ratio - * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. - */ - -DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); - -static DEFINE_PER_CPU(u64, arch_prev_aperf); -static DEFINE_PER_CPU(u64, arch_prev_mperf); -static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; -static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; - -void arch_set_max_freq_ratio(bool turbo_disabled) -{ - arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : - arch_turbo_freq_ratio; -} -EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); - -static bool turbo_disabled(void) -{ - u64 misc_en; - int err; - - err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); - if (err) - return false; - - return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); -} - -static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) -{ - int err; - - err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); - if (err) - return false; - - err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ - *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ - - return true; -} - -#define X86_MATCH(model) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) - -static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { - X86_MATCH(XEON_PHI_KNL), - X86_MATCH(XEON_PHI_KNM), - {} -}; - -static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { - X86_MATCH(SKYLAKE_X), - {} -}; - -static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { - X86_MATCH(ATOM_GOLDMONT), - X86_MATCH(ATOM_GOLDMONT_D), - X86_MATCH(ATOM_GOLDMONT_PLUS), - {} -}; - -static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, - int num_delta_fratio) -{ - int fratio, delta_fratio, found; - int err, i; - u64 msr; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); - if (err) - return false; - - fratio = (msr >> 8) & 0xFF; - i = 16; - found = 0; - do { - if (found >= num_delta_fratio) { - *turbo_freq = fratio; - return true; - } - - delta_fratio = (msr >> (i + 5)) & 0x7; - - if (delta_fratio) { - found += 1; - fratio -= delta_fratio; - } - - i += 8; - } while (i < 64); - - return true; -} - -static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) -{ - u64 ratios, counts; - u32 group_size; - int err, i; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); - if (err) - return false; - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); - if (err) - return false; - - for (i = 0; i < 64; i += 8) { - group_size = (counts >> i) & 0xFF; - if (group_size >= size) { - *turbo_freq = (ratios >> i) & 0xFF; - return true; - } - } - - return false; -} - -static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) -{ - u64 msr; - int err; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ - - /* The CPU may have less than 4 cores */ - if (!*turbo_freq) - *turbo_freq = msr & 0xFF; /* 1C turbo */ - - return true; -} - -static bool intel_set_max_freq_ratio(void) -{ - u64 base_freq, turbo_freq; - u64 turbo_ratio; - - if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) - goto out; - - if (x86_match_cpu(has_glm_turbo_ratio_limits) && - skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) - goto out; - - if (x86_match_cpu(has_knl_turbo_ratio_limits) && - knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) - goto out; - - if (x86_match_cpu(has_skx_turbo_ratio_limits) && - skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) - goto out; - - if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) - goto out; - - return false; - -out: - /* - * Some hypervisors advertise X86_FEATURE_APERFMPERF - * but then fill all MSR's with zeroes. - * Some CPUs have turbo boost but don't declare any turbo ratio - * in MSR_TURBO_RATIO_LIMIT. - */ - if (!base_freq || !turbo_freq) { - pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); - return false; - } - - turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); - if (!turbo_ratio) { - pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); - return false; - } - - arch_turbo_freq_ratio = turbo_ratio; - arch_set_max_freq_ratio(turbo_disabled()); - - return true; -} - -static void init_counter_refs(void) -{ - u64 aperf, mperf; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); -} - -#ifdef CONFIG_PM_SLEEP -static struct syscore_ops freq_invariance_syscore_ops = { - .resume = init_counter_refs, -}; - -static void register_freq_invariance_syscore_ops(void) -{ - /* Bail out if registered already. */ - if (freq_invariance_syscore_ops.node.prev) - return; - - register_syscore_ops(&freq_invariance_syscore_ops); -} -#else -static inline void register_freq_invariance_syscore_ops(void) {} -#endif - -void init_freq_invariance(bool secondary, bool cppc_ready) -{ - bool ret = false; - - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return; - - if (secondary) { - if (static_branch_likely(&arch_scale_freq_key)) { - init_counter_refs(); - } - return; - } - - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - ret = intel_set_max_freq_ratio(); - else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (!cppc_ready) { - return; - } - ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio); - } - - if (ret) { - init_counter_refs(); - static_branch_enable(&arch_scale_freq_key); - register_freq_invariance_syscore_ops(); - pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); - } else { - pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); - } -} - -static void disable_freq_invariance_workfn(struct work_struct *work) -{ - static_branch_disable(&arch_scale_freq_key); -} - -static DECLARE_WORK(disable_freq_invariance_work, - disable_freq_invariance_workfn); - -DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; - -void arch_scale_freq_tick(void) -{ - u64 freq_scale; - u64 aperf, mperf; - u64 acnt, mcnt; - - if (!arch_scale_freq_invariant()) - return; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - acnt = aperf - this_cpu_read(arch_prev_aperf); - mcnt = mperf - this_cpu_read(arch_prev_mperf); - - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); - - if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) - goto error; - - if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) - goto error; - - freq_scale = div64_u64(acnt, mcnt); - if (!freq_scale) - goto error; - - if (freq_scale > SCHED_CAPACITY_SCALE) - freq_scale = SCHED_CAPACITY_SCALE; - - this_cpu_write(arch_freq_scale, freq_scale); - return; - -error: - pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); - schedule_work(&disable_freq_invariance_work); -} -#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 660b78827638..8cc653ffdccd 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -68,9 +68,6 @@ static int __init control_va_addr_alignment(char *str) if (*str == 0) return 1; - if (*str == '=') - str++; - if (!strcmp(str, "32")) va_align.flags = ALIGN_VA_32; else if (!strcmp(str, "64")) @@ -80,11 +77,11 @@ static int __init control_va_addr_alignment(char *str) else if (!strcmp(str, "on")) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; else - return 0; + pr_warn("invalid option value: 'align_va_addr=%s'\n", str); return 1; } -__setup("align_va_addr", control_va_addr_alignment); +__setup("align_va_addr=", control_va_addr_alignment); SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1563fb995005..d62b2cb85cea 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -62,6 +62,7 @@ #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/vdso.h> +#include <asm/tdx.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -686,13 +687,40 @@ static bool try_fixup_enqcmd_gp(void) #endif } +static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + if (fixup_exception(regs, trapnr, error_code, 0)) + return true; + + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + + /* + * To be potentially processing a kprobe fault and to trust the result + * from kprobe_running(), we have to be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, trapnr)) + return true; + + return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; +} + +static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + show_signal(current, SIGSEGV, "", str, regs, error_code); + force_sig(SIGSEGV); +} + DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; - struct task_struct *tsk; unsigned long gp_addr; - int ret; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; @@ -711,40 +739,18 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) return; } - tsk = current; - if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; - show_signal(tsk, SIGSEGV, "", desc, regs, error_code); - force_sig(SIGSEGV); + gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } - if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - goto exit; - - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - - /* - * To be potentially processing a kprobe fault and to trust the result - * from kprobe_running(), we have to be non-preemptible. - */ - if (!preemptible() && - kprobe_running() && - kprobe_fault_handler(regs, X86_TRAP_GP)) - goto exit; - - ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); - if (ret == NOTIFY_STOP) + if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc)) goto exit; if (error_code) @@ -892,14 +898,10 @@ sync: } #endif -struct bad_iret_stack { - void *error_entry_ret; - struct pt_regs regs; -}; - -asmlinkage __visible noinstr -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { + struct pt_regs tmp, *new_stack; + /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault @@ -908,19 +910,18 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ - struct bad_iret_stack tmp, *new_stack = - (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ - __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); + __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ - __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); + __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); - BUG_ON(!user_mode(&new_stack->regs)); + BUG_ON(!user_mode(new_stack)); return new_stack; } #endif @@ -1343,6 +1344,91 @@ DEFINE_IDTENTRY(exc_device_not_available) } } +#ifdef CONFIG_INTEL_TDX_GUEST + +#define VE_FAULT_STR "VE fault" + +static void ve_raise_fault(struct pt_regs *regs, long error_code) +{ + if (user_mode(regs)) { + gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); + return; + } + + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)) + return; + + die_addr(VE_FAULT_STR, regs, error_code, 0); +} + +/* + * Virtualization Exceptions (#VE) are delivered to TDX guests due to + * specific guest actions which may happen in either user space or the + * kernel: + * + * * Specific instructions (WBINVD, for example) + * * Specific MSR accesses + * * Specific CPUID leaf accesses + * * Access to specific guest physical addresses + * + * In the settings that Linux will run in, virtualization exceptions are + * never generated on accesses to normal, TD-private memory that has been + * accepted (by BIOS or with tdx_enc_status_changed()). + * + * Syscall entry code has a critical window where the kernel stack is not + * yet set up. Any exception in this window leads to hard to debug issues + * and can be exploited for privilege escalation. Exceptions in the NMI + * entry code also cause issues. Returning from the exception handler with + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. + * + * For these reasons, the kernel avoids #VEs during the syscall gap and + * the NMI entry code. Entry code paths do not access TD-shared memory, + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves + * that might generate #VE. VMM can remove memory from TD at any point, + * but access to unaccepted (or missing) private memory leads to VM + * termination, not to #VE. + * + * Similarly to page faults and breakpoints, #VEs are allowed in NMI + * handlers once the kernel is ready to deal with nested NMIs. + * + * During #VE delivery, all interrupts, including NMIs, are blocked until + * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads + * the VE info. + * + * If a guest kernel action which would normally cause a #VE occurs in + * the interrupt-disabled region before TDGETVEINFO, a #DF (fault + * exception) is delivered to the guest which will result in an oops. + * + * The entry code has been audited carefully for following these expectations. + * Changes in the entry code have to be audited for correctness vs. this + * aspect. Similarly to #PF, #VE in these places will expose kernel to + * privilege escalation or may lead to random crashes. + */ +DEFINE_IDTENTRY(exc_virtualization_exception) +{ + struct ve_info ve; + + /* + * NMIs/Machine-checks/Interrupts will be in a disabled state + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ + tdx_get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* + * If tdx_handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. + */ + if (!tdx_handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0); + + cond_local_irq_disable(regs); +} + +#endif + #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index c21bcd668284..e9e803a4d44c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -151,7 +151,7 @@ exit_vm86: memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); - lazy_load_gs(vm86->regs32.gs); + loadsegment(gs, vm86->regs32.gs); regs->pt.ax = retval; return; @@ -325,7 +325,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) * Save old state */ vm86->saved_sp0 = tsk->thread.sp0; - lazy_save_gs(vm86->regs32.gs); + savesegment(gs, vm86->regs32.gs); /* make room for real-mode segments */ preempt_disable(); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 598334ed5fbc..de6d44e07e34 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -19,6 +19,7 @@ #include <asm/user.h> #include <asm/fpu/xstate.h> #include <asm/sgx.h> +#include <asm/cpuid.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -744,24 +745,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - switch (function) { - case 4: - case 7: - case 0xb: - case 0xd: - case 0xf: - case 0x10: - case 0x12: - case 0x14: - case 0x17: - case 0x18: - case 0x1d: - case 0x1e: - case 0x1f: - case 0x8000001d: + if (cpuid_function_is_indexed(function)) entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - break; - } return entry; } @@ -887,6 +872,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) union cpuid10_eax eax; union cpuid10_edx edx; + if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + perf_get_x86_pmu_capability(&cap); /* diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 46f9dfb60469..a0702b6be3e8 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1914,7 +1914,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); - unsigned long valid_bank_mask; + u64 valid_bank_mask; u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; @@ -1956,7 +1956,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; - if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64)) + if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 64a2a7e2be90..45e1573f8f1d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -473,30 +473,6 @@ retry: } #endif -static bool spte_has_volatile_bits(u64 spte) -{ - if (!is_shadow_present_pte(spte)) - return false; - - /* - * Always atomically update spte if it can be updated - * out of mmu-lock, it can ensure dirty bit is not lost, - * also, it can help us to get a stable is_writable_pte() - * to ensure tlb flush is not missed. - */ - if (spte_can_locklessly_be_made_writable(spte) || - is_access_track_spte(spte)) - return true; - - if (spte_ad_enabled(spte)) { - if ((spte & shadow_accessed_mask) == 0 || - (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) - return true; - } - - return false; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -557,7 +533,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ - if (spte_can_locklessly_be_made_writable(old_spte) && + if (is_mmu_writable_spte(old_spte) && !is_writable_pte(new_spte)) flush = true; @@ -591,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; - if (!spte_has_volatile_bits(old_spte)) + if (!is_shadow_present_pte(old_spte) || + !spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, 0ull); else old_spte = __update_clear_spte_slow(sptep, 0ull); @@ -1187,7 +1164,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_can_locklessly_be_made_writable(spte))) + !(pt_protect && is_mmu_writable_spte(spte))) return false; rmap_printk("spte %p %llx\n", sptep, *sptep); @@ -3196,8 +3173,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * be removed in the fast path only if the SPTE was * write-protected for dirty-logging or access tracking. */ - if (fault->write && - spte_can_locklessly_be_made_writable(spte)) { + if (fault->write && is_mmu_writable_spte(spte)) { new_spte |= PT_WRITABLE_MASK; /* @@ -5494,14 +5470,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) uint i; if (pcid == kvm_get_active_pcid(vcpu)) { - mmu->invlpg(vcpu, gva, mmu->root.hpa); + if (mmu->invlpg) + mmu->invlpg(vcpu, gva, mmu->root.hpa); tlb_flush = true; } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + if (mmu->invlpg) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); tlb_flush = true; } } @@ -5689,6 +5667,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; + bool unstable; restart: list_for_each_entry_safe_reverse(sp, node, @@ -5720,11 +5699,12 @@ restart: goto restart; } - if (__kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { - batch += nr_zapped; + unstable = __kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages, &nr_zapped); + batch += nr_zapped; + + if (unstable) goto restart; - } } /* diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 4739b53c9734..e5c0b6db6f2c 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } +/* + * Returns true if the SPTE has bits that may be set without holding mmu_lock. + * The caller is responsible for checking if the SPTE is shadow-present, and + * for determining whether or not the caller cares about non-leaf SPTEs. + */ +bool spte_has_volatile_bits(u64 spte) +{ + /* + * Always atomically update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. + */ + if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) + return true; + + if (is_access_track_spte(spte)) + return true; + + if (spte_ad_enabled(spte)) { + if (!(spte & shadow_accessed_mask) || + (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) + return true; + } + + return false; +} + bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index e4abeb5df1b1..80ab0f5cff01 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -390,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte) "kvm: Writable SPTE is not MMU-writable: %llx", spte); } -static inline bool spte_can_locklessly_be_made_writable(u64 spte) +static inline bool is_mmu_writable_spte(u64 spte) { return spte & shadow_mmu_writable_mask; } @@ -404,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } +bool spte_has_volatile_bits(u64 spte); + bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index b1eaf6ec0e0b..f0af385c56e0 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -6,6 +6,7 @@ #include <linux/kvm_host.h> #include "mmu.h" +#include "spte.h" /* * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs) @@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) { return READ_ONCE(*rcu_dereference(sptep)); } -static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val) + +static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) +{ + return xchg(rcu_dereference(sptep), new_spte); +} + +static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) +{ + WRITE_ONCE(*rcu_dereference(sptep), new_spte); +} + +static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, + u64 new_spte, int level) { - WRITE_ONCE(*rcu_dereference(sptep), val); + /* + * Atomically write the SPTE if it is a shadow-present, leaf SPTE with + * volatile bits, i.e. has bits that can be set outside of mmu_lock. + * The Writable bit can be set by KVM's fast page fault handler, and + * Accessed and Dirty bits can be set by the CPU. + * + * Note, non-leaf SPTEs do have Accessed bits and those bits are + * technically volatile, but KVM doesn't consume the Accessed bit of + * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This + * logic needs to be reassessed if KVM were to use non-leaf Accessed + * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. + */ + if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && + spte_has_volatile_bits(old_spte)) + return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); + + __kvm_tdp_mmu_write_spte(sptep, new_spte); + return old_spte; } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index edc68538819b..922b06bf4b94 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) tdp_mmu_unlink_sp(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - u64 *sptep = rcu_dereference(pt) + i; + tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); - u64 old_child_spte; + u64 old_spte; if (shared) { /* @@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * value to the removed SPTE value. */ for (;;) { - old_child_spte = xchg(sptep, REMOVED_SPTE); - if (!is_removed_spte(old_child_spte)) + old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); + if (!is_removed_spte(old_spte)) break; cpu_relax(); } @@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * are guarded by the memslots generation, not by being * unreachable. */ - old_child_spte = READ_ONCE(*sptep); - if (!is_shadow_present_pte(old_child_spte)) + old_spte = kvm_tdp_mmu_read_spte(sptep); + if (!is_shadow_present_pte(old_spte)) continue; /* - * Marking the SPTE as a removed SPTE is not - * strictly necessary here as the MMU lock will - * stop other threads from concurrently modifying - * this SPTE. Using the removed SPTE value keeps - * the two branches consistent and simplifies - * the function. + * Use the common helper instead of a raw WRITE_ONCE as + * the SPTE needs to be updated atomically if it can be + * modified by a different vCPU outside of mmu_lock. + * Even though the parent SPTE is !PRESENT, the TLB + * hasn't yet been flushed, and both Intel and AMD + * document that A/D assists can use upper-level PxE + * entries that are cached in the TLB, i.e. the CPU can + * still access the page and mark it dirty. + * + * No retry is needed in the atomic update path as the + * sole concern is dropping a Dirty bit, i.e. no other + * task can zap/remove the SPTE as mmu_lock is held for + * write. Marking the SPTE as a removed SPTE is not + * strictly necessary for the same reason, but using + * the remove SPTE value keeps the shared/exclusive + * paths consistent and allows the handle_changed_spte() + * call below to hardcode the new value to REMOVED_SPTE. + * + * Note, even though dropping a Dirty bit is the only + * scenario where a non-atomic update could result in a + * functional bug, simply checking the Dirty bit isn't + * sufficient as a fast page fault could read the upper + * level SPTE before it is zapped, and then make this + * target SPTE writable, resume the guest, and set the + * Dirty bit between reading the SPTE above and writing + * it here. */ - WRITE_ONCE(*sptep, REMOVED_SPTE); + old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, + REMOVED_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_child_spte, REMOVED_SPTE, level, - shared); + old_spte, REMOVED_SPTE, level, shared); } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, KVM_PAGES_PER_HPAGE(iter->level)); /* - * No other thread can overwrite the removed SPTE as they - * must either wait on the MMU lock or use - * tdp_mmu_set_spte_atomic which will not overwrite the - * special removed SPTE value. No bookkeeping is needed - * here since the SPTE is going from non-present - * to non-present. + * No other thread can overwrite the removed SPTE as they must either + * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not + * overwrite the special removed SPTE value. No bookkeeping is needed + * here since the SPTE is going from non-present to non-present. Use + * the raw write helper to avoid an unnecessary check on volatile bits. */ - kvm_tdp_mmu_write_spte(iter->sptep, 0); + __kvm_tdp_mmu_write_spte(iter->sptep, 0); return 0; } @@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * unless performing certain dirty logging operations. * Leaving record_dirty_log unset in that case prevents page * writes from being double counted. + * + * Returns the old SPTE value, which _may_ be different than @old_spte if the + * SPTE had voldatile bits. */ -static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, - u64 old_spte, u64 new_spte, gfn_t gfn, int level, - bool record_acc_track, bool record_dirty_log) +static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, + u64 old_spte, u64 new_spte, gfn_t gfn, int level, + bool record_acc_track, bool record_dirty_log) { lockdep_assert_held_write(&kvm->mmu_lock); @@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, */ WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); - kvm_tdp_mmu_write_spte(sptep, new_spte); + old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); @@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, if (record_dirty_log) handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, new_spte, level); + return old_spte; } static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, @@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, { WARN_ON_ONCE(iter->yielded); - __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte, - new_spte, iter->gfn, iter->level, - record_acc_track, record_dirty_log); + iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, + iter->old_spte, new_spte, + iter->gfn, iter->level, + record_acc_track, record_dirty_log); } static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eca39f56c231..0604bc29f0b8 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -171,9 +171,12 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } -static int cmp_u64(const void *a, const void *b) +static int cmp_u64(const void *pa, const void *pb) { - return *(__u64 *)a - *(__u64 *)b; + u64 a = *(u64 *)pa; + u64 b = *(u64 *)pb; + + return (a > b) - (a < b); } void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index b14860863c39..16a5ebb420cf 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -45,6 +45,22 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, }; +/* duplicated from amd_f17h_perfmon_event_map. */ +static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = { + [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES }, + [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, +}; + +/* amd_pmc_perf_hw_id depends on these being the same size */ +static_assert(ARRAY_SIZE(amd_event_mapping) == + ARRAY_SIZE(amd_f17h_event_mapping)); + static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); @@ -140,6 +156,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) { + struct kvm_event_hw_type_mapping *event_mapping; u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; @@ -148,15 +165,20 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) if (WARN_ON(pmc_is_fixed(pmc))) return PERF_COUNT_HW_MAX; + if (guest_cpuid_family(pmc->vcpu) >= 0x17) + event_mapping = amd_f17h_event_mapping; + else + event_mapping = amd_event_mapping; + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) - if (amd_event_mapping[i].eventsel == event_select - && amd_event_mapping[i].unit_mask == unit_mask) + if (event_mapping[i].eventsel == event_select + && event_mapping[i].unit_mask == unit_mask) break; if (i == ARRAY_SIZE(amd_event_mapping)) return PERF_COUNT_HW_MAX; - return amd_event_mapping[i].event_type; + return event_mapping[i].event_type; } /* check if a PMC is enabled by comparing it against global_ctrl bits. Because diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0ad70c12c7c3..636c77ef55fc 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -562,12 +562,20 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { - struct vmcb_save_area *save = &svm->vmcb->save; + struct sev_es_save_area *save = svm->sev_es.vmsa; /* Check some debug related fields before encrypting the VMSA */ - if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1)) + if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) return -EINVAL; + /* + * SEV-ES will use a VMSA that is pointed to by the VMCB, not + * the traditional VMSA that is part of the VMCB. Copy the + * traditional VMSA as it has been built so far (in prep + * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. + */ + memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save)); + /* Sync registgers */ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; @@ -595,14 +603,6 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; - /* - * SEV-ES will use a VMSA that is pointed to by the VMCB, not - * the traditional VMSA that is part of the VMCB. Copy the - * traditional VMSA as it has been built so far (in prep - * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. - */ - memcpy(svm->sev_es.vmsa, save, sizeof(*save)); - return 0; } @@ -1594,24 +1594,51 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } +/* vCPU mutex subclasses. */ +enum sev_migration_role { + SEV_MIGRATION_SOURCE = 0, + SEV_MIGRATION_TARGET, + SEV_NR_MIGRATION_ROLES, +}; -static int sev_lock_vcpus_for_migration(struct kvm *kvm) +static int sev_lock_vcpus_for_migration(struct kvm *kvm, + enum sev_migration_role role) { struct kvm_vcpu *vcpu; unsigned long i, j; + bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable(&vcpu->mutex)) + if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; + + if (first) { + /* + * Reset the role to one that avoids colliding with + * the role used for the first vcpu mutex. + */ + role = SEV_NR_MIGRATION_ROLES; + first = false; + } else { + mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); + } } return 0; out_unlock: + + first = true; kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; + if (first) + first = false; + else + mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); + + mutex_unlock(&vcpu->mutex); } return -EINTR; @@ -1621,8 +1648,15 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; + bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { + if (first) + first = false; + else + mutex_acquire(&vcpu->mutex.dep_map, + SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); + mutex_unlock(&vcpu->mutex); } } @@ -1748,10 +1782,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm); + ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm); + ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); if (ret) goto out_dst_vcpu; @@ -2932,7 +2966,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) sev_enc_bit)); } -void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa) +void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) { /* * As an SEV-ES guest, hardware will restore the host state on VMEXIT, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7e45d03cd018..17d334ef5430 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1270,8 +1270,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) */ vmsave(__sme_page_pa(sd->save_area)); if (sev_es_guest(vcpu->kvm)) { - struct vmcb_save_area *hostsa; - hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); + struct sev_es_save_area *hostsa; + hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); sev_es_prepare_switch_to_guest(hostsa); } @@ -3117,8 +3117,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "tr:", save01->tr.selector, save01->tr.attrib, save01->tr.limit, save01->tr.base); - pr_err("cpl: %d efer: %016llx\n", - save->cpl, save->efer); + pr_err("vmpl: %d cpl: %d efer: %016llx\n", + save->vmpl, save->cpl, save->efer); pr_err("%-15s %016llx %-13s %016llx\n", "cr0:", save->cr0, "cr2:", save->cr2); pr_err("%-15s %016llx %-13s %016llx\n", diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f76deff71002..2d83845b9032 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -181,7 +181,7 @@ struct svm_nested_state { struct vcpu_sev_es_state { /* SEV-ES support */ - struct vmcb_save_area *vmsa; + struct sev_es_save_area *vmsa; struct ghcb *ghcb; struct kvm_host_map ghcb_map; bool received_first_sipi; @@ -622,7 +622,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); -void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa); +void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d58b763df855..610355b9ccce 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5472,7 +5472,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); return vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending; + (vcpu->arch.exception.pending || vcpu->arch.exception.injected); } static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 65d15df6212d..0e65d00e2339 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -54,8 +54,8 @@ static void delay_loop(u64 __loops) " jnz 2b \n" "3: dec %0 \n" - : /* we don't need output */ - :"a" (loops) + : "+a" (loops) + : ); } diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index b781d324211b..21104c41cba0 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -342,9 +342,9 @@ static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff) */ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) { -#ifdef CONFIG_X86_64 unsigned short sel; +#ifdef CONFIG_X86_64 switch (seg_reg_idx) { case INAT_SEG_REG_IGNORE: return 0; @@ -402,7 +402,8 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) case INAT_SEG_REG_FS: return (unsigned short)(regs->fs & 0xffff); case INAT_SEG_REG_GS: - return get_user_gs(regs); + savesegment(gs, sel); + return sel; case INAT_SEG_REG_IGNORE: default: return -EINVAL; diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index 2b3eb8c948a3..a58f451a7dd3 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -11,7 +11,7 @@ #include <asm/msr.h> #include <asm/archrandom.h> #include <asm/e820/api.h> -#include <asm/io.h> +#include <asm/shared/io.h> /* * When built for the regular kernel, several functions need to be stubbed out diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/arch/x86/lib/mmx_32.c +++ /dev/null diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index b82ca14ba718..4a9fd9029a53 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -153,7 +153,7 @@ static long pm_address(u_char FPU_modrm, u_char segment, switch (segment) { case PREFIX_GS_ - 1: /* user gs handling can be lazy, use special accessors */ - addr->selector = get_user_gs(FPU_info->regs); + savesegment(gs, addr->selector); break; default: addr->selector = PM_REG_(segment); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fe3d3061fc11..d957dc15b371 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -20,13 +20,12 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ - pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o + pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o obj-y += pat/ # Make sure __phys_addr has no stackprotector CFLAGS_physaddr.o := -fno-stack-protector -CFLAGS_setup_nx.o := -fno-stack-protector CFLAGS_mem_encrypt_identity.o := -fno-stack-protector CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 058b2f36b3a6..b3ca7d23e4b0 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -154,7 +154,7 @@ int __init amd_numa_init(void) node_set(nodeid, numa_nodes_parsed); } - if (!nodes_weight(numa_nodes_parsed)) + if (nodes_empty(numa_nodes_parsed)) return -ENOENT; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d0074c6ed31a..fad8faa29d04 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -149,7 +149,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) unsigned char opcode; if (user_mode(regs)) { - if (get_user(opcode, instr)) + if (get_user(opcode, (unsigned char __user *) instr)) break; } else { if (get_kernel_nofault(opcode, instr)) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 96d34ebb20a9..61d0ab154f96 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -110,7 +110,6 @@ int force_personality32; /* * noexec32=on|off * Control non executable heap for 32bit processes. - * To control the stack too use noexec=off * * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) * off PROT_READ implies PROT_EXEC @@ -902,6 +901,8 @@ static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) { + const unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + vmemmap_flush_unused_pmd(); /* @@ -914,8 +915,7 @@ static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long * Mark with PAGE_UNUSED the unused parts of the new memmap range */ if (!IS_ALIGNED(start, PMD_SIZE)) - memset((void *)start, PAGE_UNUSED, - start - ALIGN_DOWN(start, PMD_SIZE)); + memset((void *)page, PAGE_UNUSED, start - page); /* * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 17a492c27306..1ad0228f8ceb 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -242,10 +242,15 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size, * If the page being mapped is in memory and SEV is active then * make sure the memory encryption attribute is enabled in the * resulting mapping. + * In TDX guests, memory is marked private by default. If encryption + * is not requested (using encrypted), explicitly set decrypt + * attribute in all IOREMAPPED memory. */ prot = PAGE_KERNEL_IO; if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted) prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); switch (pcm) { case _PAGE_CACHE_MODE_UC: diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 50d209939c66..11350e2fd736 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -42,7 +42,14 @@ bool force_dma_unencrypted(struct device *dev) static void print_mem_encrypt_feature_info(void) { - pr_info("AMD Memory Encryption Features active:"); + pr_info("Memory Encryption Features active:"); + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_cont(" Intel TDX\n"); + return; + } + + pr_cont(" AMD"); /* Secure Memory Encryption */ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { @@ -62,6 +69,10 @@ static void print_mem_encrypt_feature_info(void) if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) pr_cont(" SEV-ES"); + /* Secure Nested Paging */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + pr_cont(" SEV-SNP"); + pr_cont("\n"); } diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 6169053c2854..d3c88d9ef8d6 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -31,6 +31,7 @@ #include <asm/processor-flags.h> #include <asm/msr.h> #include <asm/cmdline.h> +#include <asm/sev.h> #include "mm_internal.h" @@ -48,6 +49,36 @@ EXPORT_SYMBOL(sme_me_mask); static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); /* + * SNP-specific routine which needs to additionally change the page state from + * private to shared before copying the data from the source to destination and + * restore after the copy. + */ +static inline void __init snp_memcpy(void *dst, void *src, size_t sz, + unsigned long paddr, bool decrypt) +{ + unsigned long npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (decrypt) { + /* + * @paddr needs to be accessed decrypted, mark the page shared in + * the RMP table before copying it. + */ + early_snp_set_memory_shared((unsigned long)__va(paddr), paddr, npages); + + memcpy(dst, src, sz); + + /* Restore the page state after the memcpy. */ + early_snp_set_memory_private((unsigned long)__va(paddr), paddr, npages); + } else { + /* + * @paddr need to be accessed encrypted, no need for the page state + * change. + */ + memcpy(dst, src, sz); + } +} + +/* * This routine does not change the underlying encryption setting of the * page(s) that map this memory. It assumes that eventually the memory is * meant to be accessed as either encrypted or decrypted but the contents @@ -95,8 +126,13 @@ static void __init __sme_early_enc_dec(resource_size_t paddr, * Use a temporary buffer, of cache-line multiple size, to * avoid data corruption as documented in the APM. */ - memcpy(sme_early_buffer, src, len); - memcpy(dst, sme_early_buffer, len); + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + snp_memcpy(sme_early_buffer, src, len, paddr, enc); + snp_memcpy(dst, sme_early_buffer, len, paddr, !enc); + } else { + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + } early_memunmap(dst, len); early_memunmap(src, len); @@ -280,11 +316,24 @@ static void enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) static void amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc) { + /* + * To maintain the security guarantees of SEV-SNP guests, make sure + * to invalidate the memory before encryption attribute is cleared. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc) + snp_set_memory_shared(vaddr, npages); } /* Return true unconditionally: return value doesn't matter for the SEV side */ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc) { + /* + * After memory is mapped encrypted in the page table, validate it + * so that it is consistent with the page table updates. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && enc) + snp_set_memory_private(vaddr, npages); + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) enc_dec_hypercall(vaddr, npages, enc); @@ -322,14 +371,28 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) clflush_cache_range(__va(pa), size); /* Encrypt/decrypt the contents in-place */ - if (enc) + if (enc) { sme_early_encrypt(pa, size); - else + } else { sme_early_decrypt(pa, size); + /* + * ON SNP, the page state in the RMP table must happen + * before the page table updates. + */ + early_snp_set_memory_shared((unsigned long)__va(pa), pa, 1); + } + /* Change the page encryption mask. */ new_pte = pfn_pte(pfn, new_prot); set_pte_atomic(kpte, new_pte); + + /* + * If page is set encrypted in the page table, then update the RMP table to + * add this page as private. + */ + if (enc) + early_snp_set_memory_private((unsigned long)__va(pa), pa, 1); } static int __init early_set_memory_enc_dec(unsigned long vaddr, diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index b43bc24d2bb6..f415498d3175 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -45,6 +45,7 @@ #include <asm/sections.h> #include <asm/cmdline.h> #include <asm/coco.h> +#include <asm/sev.h> #include "mm_internal.h" @@ -509,8 +510,11 @@ void __init sme_enable(struct boot_params *bp) bool active_by_default; unsigned long me_mask; char buffer[16]; + bool snp; u64 msr; + snp = snp_init(bp); + /* Check for the SME/SEV support leaf */ eax = 0x80000000; ecx = 0; @@ -542,6 +546,10 @@ void __init sme_enable(struct boot_params *bp) sev_status = __rdmsr(MSR_AMD64_SEV); feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; + /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */ + if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + snp_abort(); + /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { /* diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 933a2ebad471..c3317f0650d8 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -400,7 +400,7 @@ static void leave_uniprocessor(void) int cpu; int err; - if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0) + if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus)) return; pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 1a02b791d273..9a9305367fdd 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -123,7 +123,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. */ - while (nodes_weight(physnode_mask)) { + while (!nodes_empty(physnode_mask)) { for_each_node_mask(i, physnode_mask) { u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; @@ -270,7 +270,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. */ - while (nodes_weight(physnode_mask)) { + while (!nodes_empty(physnode_mask)) { for_each_node_mask(i, physnode_mask) { u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 4ba2a3ee4bce..d5ef64ddd35e 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -101,7 +101,7 @@ int pat_debug_enable; static int __init pat_debug_setup(char *str) { pat_debug_enable = 1; - return 0; + return 1; } __setup("debugpat", pat_debug_setup); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 5d5c7bb50ce9..ffe3b3a087fe 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -540,7 +540,7 @@ static inline bool pti_kernel_image_global_ok(void) * cases where RANDSTRUCT is in use to help keep the layout a * secret. */ - if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT)) + if (IS_ENABLED(CONFIG_RANDSTRUCT)) return false; return true; diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c deleted file mode 100644 index ed5667f5169f..000000000000 --- a/arch/x86/mm/setup_nx.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/spinlock.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/pgtable.h> - -#include <asm/proto.h> -#include <asm/cpufeature.h> - -static int disable_nx; - -/* - * noexec = on|off - * - * Control non-executable mappings for processes. - * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - } - x86_configure_nx(); - return 0; -} -early_param("noexec", noexec_setup); - -void x86_configure_nx(void) -{ - if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) - __supported_pte_mask |= _PAGE_NX; - else - __supported_pte_mask &= ~_PAGE_NX; -} - -void __init x86_report_nx(void) -{ - if (!boot_cpu_has(X86_FEATURE_NX)) { - printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "missing in CPU!\n"); - } else { -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) - if (disable_nx) { - printk(KERN_INFO "NX (Execute Disable) protection: " - "disabled by kernel command line option\n"); - } else { - printk(KERN_INFO "NX (Execute Disable) protection: " - "active\n"); - } -#else - /* 32bit non-PAE kernel, NX cannot be used */ - printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "cannot be enabled: non-PAE kernel!\n"); -#endif - } -} diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 16b6efacf7c6..f298b18a9a3d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -228,6 +228,11 @@ static void jit_fill_hole(void *area, unsigned int size) memset(area, 0xcc, size); } +int bpf_arch_text_invalidate(void *dst, size_t len) +{ + return IS_ERR_OR_NULL(text_poke_set(dst, 0xcc, len)); +} + struct jit_context { int cleanup_addr; /* Epilogue code offset */ @@ -1762,13 +1767,32 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size, bool save_ret) + struct bpf_tramp_link *l, int stack_size, + int run_ctx_off, bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; + int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); + struct bpf_prog *p = l->link.prog; + u64 cookie = l->cookie; + + /* mov rdi, cookie */ + emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie); + + /* Prepare struct bpf_tramp_run_ctx. + * + * bpf_tramp_run_ctx is already preserved by + * arch_prepare_bpf_trampoline(). + * + * mov QWORD PTR [rbp - run_ctx_off + ctx_cookie_off], rdi + */ + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + /* arg2: lea rsi, [rbp - ctx_cookie_off] */ + EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); + if (emit_call(&prog, p->aux->sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter, prog)) @@ -1814,6 +1838,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: mov rsi, rbx <- start time in nsec */ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + /* arg3: lea rdx, [rbp - run_ctx_off] */ + EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); if (emit_call(&prog, p->aux->sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit, prog)) @@ -1850,15 +1876,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size, - bool save_ret) + struct bpf_tramp_links *tl, int stack_size, + int run_ctx_off, bool save_ret) { int i; u8 *prog = *pprog; - for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, - save_ret)) + for (i = 0; i < tl->nr_links; i++) { + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, + run_ctx_off, save_ret)) return -EINVAL; } *pprog = prog; @@ -1866,8 +1892,8 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, } static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size, - u8 **branches) + struct bpf_tramp_links *tl, int stack_size, + int run_ctx_off, u8 **branches) { u8 *prog = *pprog; int i; @@ -1877,8 +1903,8 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, */ emit_mov_imm32(&prog, false, BPF_REG_0, 0); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true)) + for (i = 0; i < tl->nr_links; i++) { + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true)) return -EINVAL; /* mod_ret prog stored return value into [rbp - 8]. Emit: @@ -1980,14 +2006,14 @@ static bool is_valid_bpf_tramp_flags(unsigned int flags) */ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call) { int ret, i, nr_args = m->nr_args; - int regs_off, ip_off, args_off, stack_size = nr_args * 8; - struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; - struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; - struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; + int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; u8 **branches = NULL; u8 *prog; bool save_ret; @@ -2014,6 +2040,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * RBP - args_off [ args count ] always * * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag + * + * RBP - run_ctx_off [ bpf_tramp_run_ctx ] */ /* room for return value of orig_call or fentry prog */ @@ -2032,6 +2060,9 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ip_off = stack_size; + stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7; + run_ctx_off = stack_size; + if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip patched call instruction and point orig_call to actual * body of the kernel function. @@ -2078,19 +2109,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } } - if (fentry->nr_progs) - if (invoke_bpf(m, &prog, fentry, regs_off, + if (fentry->nr_links) + if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; - if (fmod_ret->nr_progs) { - branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *), + if (fmod_ret->nr_links) { + branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *), GFP_KERNEL); if (!branches) return -ENOMEM; if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off, - branches)) { + run_ctx_off, branches)) { ret = -EINVAL; goto cleanup; } @@ -2111,7 +2142,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i prog += X86_PATCH_SIZE; } - if (fmod_ret->nr_progs) { + if (fmod_ret->nr_links) { /* From Intel 64 and IA-32 Architectures Optimization * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler * Coding Rule 11: All branch targets should be 16-byte @@ -2121,13 +2152,13 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i /* Update the branches saved in invoke_bpf_mod_ret with the * aligned address of do_fexit. */ - for (i = 0; i < fmod_ret->nr_progs; i++) + for (i = 0; i < fmod_ret->nr_links; i++) emit_cond_near_jump(&branches[i], prog, branches[i], X86_JNE); } - if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, regs_off, false)) { + if (fexit->nr_links) + if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) { ret = -EINVAL; goto cleanup; } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 97b63e35e152..a498b847d740 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -25,6 +25,8 @@ #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) #define PIRQ_VERSION 0x0100 +#define IRT_SIGNATURE (('$' << 0) + ('I' << 8) + ('R' << 16) + ('T' << 24)) + static int broken_hp_bios_irq9; static int acer_tm360_irqrouting; @@ -68,30 +70,99 @@ void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq; * and perform checksum verification. */ -static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr) +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr, + u8 *limit) { struct irq_routing_table *rt; int i; u8 sum; - rt = (struct irq_routing_table *) addr; + rt = (struct irq_routing_table *)addr; if (rt->signature != PIRQ_SIGNATURE || rt->version != PIRQ_VERSION || rt->size % 16 || - rt->size < sizeof(struct irq_routing_table)) + rt->size < sizeof(struct irq_routing_table) || + (limit && rt->size > limit - addr)) return NULL; sum = 0; for (i = 0; i < rt->size; i++) sum += addr[i]; if (!sum) { - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", - rt); + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%lx\n", + __pa(rt)); return rt; } return NULL; } +/* + * Handle the $IRT PCI IRQ Routing Table format used by AMI for its BCP + * (BIOS Configuration Program) external tool meant for tweaking BIOS + * structures without the need to rebuild it from sources. The $IRT + * format has been invented by AMI before Microsoft has come up with its + * $PIR format and a $IRT table is therefore there in some systems that + * lack a $PIR table. + * + * It uses the same PCI BIOS 2.1 format for interrupt routing entries + * themselves but has a different simpler header prepended instead, + * occupying 8 bytes, where a `$IRT' signature is followed by one byte + * specifying the total number of interrupt routing entries allocated in + * the table, then one byte specifying the actual number of entries used + * (which the BCP tool can take advantage of when modifying the table), + * and finally a 16-bit word giving the IRQs devoted exclusively to PCI. + * Unlike with the $PIR table there is no alignment guarantee. + * + * Given the similarity of the two formats the $IRT one is trivial to + * convert to the $PIR one, which we do here, except that obviously we + * have no information as to the router device to use, but we can handle + * it by matching PCI device IDs actually seen on the bus against ones + * that our individual routers recognise. + * + * Reportedly there is another $IRT table format where a 16-bit word + * follows the header instead that points to interrupt routing entries + * in a $PIR table provided elsewhere. In that case this code will not + * be reached though as the $PIR table will have been chosen instead. + */ +static inline struct irq_routing_table *pirq_convert_irt_table(u8 *addr, + u8 *limit) +{ + struct irt_routing_table *ir; + struct irq_routing_table *rt; + u16 size; + u8 sum; + int i; + + ir = (struct irt_routing_table *)addr; + if (ir->signature != IRT_SIGNATURE || !ir->used || ir->size < ir->used) + return NULL; + + size = sizeof(*ir) + ir->used * sizeof(ir->slots[0]); + if (size > limit - addr) + return NULL; + + DBG(KERN_DEBUG "PCI: $IRT Interrupt Routing Table found at 0x%lx\n", + __pa(ir)); + size = sizeof(*rt) + ir->used * sizeof(rt->slots[0]); + rt = kzalloc(size, GFP_KERNEL); + if (!rt) + return NULL; + + rt->signature = PIRQ_SIGNATURE; + rt->version = PIRQ_VERSION; + rt->size = size; + rt->exclusive_irqs = ir->exclusive_irqs; + for (i = 0; i < ir->used; i++) + rt->slots[i] = ir->slots[i]; + + addr = (u8 *)rt; + sum = 0; + for (i = 0; i < size; i++) + sum += addr[i]; + rt->checksum = -sum; + + return rt; +} /* * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. @@ -99,17 +170,29 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr) static struct irq_routing_table * __init pirq_find_routing_table(void) { + u8 * const bios_start = (u8 *)__va(0xf0000); + u8 * const bios_end = (u8 *)__va(0x100000); u8 *addr; struct irq_routing_table *rt; if (pirq_table_addr) { - rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr)); + rt = pirq_check_routing_table((u8 *)__va(pirq_table_addr), + NULL); if (rt) return rt; printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n"); } - for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { - rt = pirq_check_routing_table(addr); + for (addr = bios_start; + addr < bios_end - sizeof(struct irq_routing_table); + addr += 16) { + rt = pirq_check_routing_table(addr, bios_end); + if (rt) + return rt; + } + for (addr = bios_start; + addr < bios_end - sizeof(struct irt_routing_table); + addr++) { + rt = pirq_convert_irt_table(addr, bios_end); if (rt) return rt; } @@ -135,7 +218,8 @@ static void __init pirq_peer_trick(void) #ifdef DEBUG { int j; - DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); + DBG(KERN_DEBUG "%02x:%02x.%x slot=%02x", + e->bus, e->devfn / 8, e->devfn % 8, e->slot); for (j = 0; j < 4; j++) DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); DBG("\n"); @@ -253,6 +337,15 @@ static void write_pc_conf_nybble(u8 base, u8 index, u8 val) pc_conf_set(reg, x); } +/* + * FinALi pirq rules are as follows: + * + * - bit 0 selects between INTx Routing Table Mapping Registers, + * + * - bit 3 selects the nibble within the INTx Routing Table Mapping Register, + * + * - bits 7:4 map to bits 3:0 of the PCI INTx Sensitivity Register. + */ static int pirq_finali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { @@ -260,11 +353,13 @@ static int pirq_finali_get(struct pci_dev *router, struct pci_dev *dev, 0, 9, 3, 10, 4, 5, 7, 6, 0, 11, 0, 12, 0, 14, 0, 15 }; unsigned long flags; + u8 index; u8 x; + index = (pirq & 1) << 1 | (pirq & 8) >> 3; raw_spin_lock_irqsave(&pc_conf_lock, flags); pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); - x = irqmap[read_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1)]; + x = irqmap[read_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, index)]; pc_conf_set(PC_CONF_FINALI_LOCK, 0); raw_spin_unlock_irqrestore(&pc_conf_lock, flags); return x; @@ -278,13 +373,15 @@ static int pirq_finali_set(struct pci_dev *router, struct pci_dev *dev, }; u8 val = irqmap[irq]; unsigned long flags; + u8 index; if (!val) return 0; + index = (pirq & 1) << 1 | (pirq & 8) >> 3; raw_spin_lock_irqsave(&pc_conf_lock, flags); pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); - write_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1, val); + write_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, index, val); pc_conf_set(PC_CONF_FINALI_LOCK, 0); raw_spin_unlock_irqrestore(&pc_conf_lock, flags); return 1; @@ -293,7 +390,7 @@ static int pirq_finali_set(struct pci_dev *router, struct pci_dev *dev, static int pirq_finali_lvl(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - u8 mask = ~(1u << (pirq - 1)); + u8 mask = ~((pirq & 0xf0u) >> 4); unsigned long flags; u8 trig; @@ -579,6 +676,81 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, return 1; } + +/* + * PIRQ routing for the SiS85C497 AT Bus Controller & Megacell (ATM) + * ISA bridge used with the SiS 85C496/497 486 Green PC VESA/ISA/PCI + * Chipset. + * + * There are four PCI INTx#-to-IRQ Link registers provided in the + * SiS85C497 part of the peculiar combined 85C496/497 configuration + * space decoded by the SiS85C496 PCI & CPU Memory Controller (PCM) + * host bridge, at 0xc0/0xc1/0xc2/0xc3 respectively for the PCI INT + * A/B/C/D lines. Bit 7 enables the respective link if set and bits + * 3:0 select the 8259A IRQ line as follows: + * + * 0000 : Reserved + * 0001 : Reserved + * 0010 : Reserved + * 0011 : IRQ3 + * 0100 : IRQ4 + * 0101 : IRQ5 + * 0110 : IRQ6 + * 0111 : IRQ7 + * 1000 : Reserved + * 1001 : IRQ9 + * 1010 : IRQ10 + * 1011 : IRQ11 + * 1100 : IRQ12 + * 1101 : Reserved + * 1110 : IRQ14 + * 1111 : IRQ15 + * + * We avoid using a reserved value for disabled links, hence the + * choice of IRQ15 for that case. + * + * References: + * + * "486 Green PC VESA/ISA/PCI Chipset, SiS 85C496/497", Rev 3.0, + * Silicon Integrated Systems Corp., July 1995 + */ + +#define PCI_SIS497_INTA_TO_IRQ_LINK 0xc0u + +#define PIRQ_SIS497_IRQ_MASK 0x0fu +#define PIRQ_SIS497_IRQ_ENABLE 0x80u + +static int pirq_sis497_get(struct pci_dev *router, struct pci_dev *dev, + int pirq) +{ + int reg; + u8 x; + + reg = pirq; + if (reg >= 1 && reg <= 4) + reg += PCI_SIS497_INTA_TO_IRQ_LINK - 1; + + pci_read_config_byte(router, reg, &x); + return (x & PIRQ_SIS497_IRQ_ENABLE) ? (x & PIRQ_SIS497_IRQ_MASK) : 0; +} + +static int pirq_sis497_set(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) +{ + int reg; + u8 x; + + reg = pirq; + if (reg >= 1 && reg <= 4) + reg += PCI_SIS497_INTA_TO_IRQ_LINK - 1; + + pci_read_config_byte(router, reg, &x); + x &= ~(PIRQ_SIS497_IRQ_MASK | PIRQ_SIS497_IRQ_ENABLE); + x |= irq ? (PIRQ_SIS497_IRQ_ENABLE | irq) : PIRQ_SIS497_IRQ_MASK; + pci_write_config_byte(router, reg, x); + return 1; +} + /* * PIRQ routing for SiS 85C503 router used in several SiS chipsets. * We have to deal with the following issues here: @@ -640,11 +812,12 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, * bit 6-4 are probably unused, not like 5595 */ -#define PIRQ_SIS_IRQ_MASK 0x0f -#define PIRQ_SIS_IRQ_DISABLE 0x80 -#define PIRQ_SIS_USB_ENABLE 0x40 +#define PIRQ_SIS503_IRQ_MASK 0x0f +#define PIRQ_SIS503_IRQ_DISABLE 0x80 +#define PIRQ_SIS503_USB_ENABLE 0x40 -static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +static int pirq_sis503_get(struct pci_dev *router, struct pci_dev *dev, + int pirq) { u8 x; int reg; @@ -653,10 +826,11 @@ static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) if (reg >= 0x01 && reg <= 0x04) reg += 0x40; pci_read_config_byte(router, reg, &x); - return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK); + return (x & PIRQ_SIS503_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS503_IRQ_MASK); } -static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int pirq_sis503_set(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) { u8 x; int reg; @@ -665,8 +839,8 @@ static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i if (reg >= 0x01 && reg <= 0x04) reg += 0x40; pci_read_config_byte(router, reg, &x); - x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE); - x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE; + x &= ~(PIRQ_SIS503_IRQ_MASK | PIRQ_SIS503_IRQ_DISABLE); + x |= irq ? irq : PIRQ_SIS503_IRQ_DISABLE; pci_write_config_byte(router, reg, x); return 1; } @@ -958,13 +1132,19 @@ static __init int serverworks_router_probe(struct irq_router *r, static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - if (device != PCI_DEVICE_ID_SI_503) - return 0; - - r->name = "SIS"; - r->get = pirq_sis_get; - r->set = pirq_sis_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_SI_496: + r->name = "SiS85C497"; + r->get = pirq_sis497_get; + r->set = pirq_sis497_set; + return 1; + case PCI_DEVICE_ID_SI_503: + r->name = "SiS85C503"; + r->get = pirq_sis503_get; + r->set = pirq_sis503_set; + return 1; + } + return 0; } static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) @@ -1084,10 +1264,32 @@ static struct pci_dev *pirq_router_dev; * chipset" ? */ +static bool __init pirq_try_router(struct irq_router *r, + struct irq_routing_table *rt, + struct pci_dev *dev) +{ + struct irq_router_handler *h; + + DBG(KERN_DEBUG "PCI: Trying IRQ router for [%04x:%04x]\n", + dev->vendor, dev->device); + + for (h = pirq_routers; h->vendor; h++) { + /* First look for a router match */ + if (rt->rtr_vendor == h->vendor && + h->probe(r, dev, rt->rtr_device)) + return true; + /* Fall back to a device match */ + if (dev->vendor == h->vendor && + h->probe(r, dev, dev->device)) + return true; + } + return false; +} + static void __init pirq_find_router(struct irq_router *r) { struct irq_routing_table *rt = pirq_table; - struct irq_router_handler *h; + struct pci_dev *dev; #ifdef CONFIG_PCI_BIOS if (!rt->signature) { @@ -1106,50 +1308,94 @@ static void __init pirq_find_router(struct irq_router *r) DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n", rt->rtr_vendor, rt->rtr_device); - pirq_router_dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus, - rt->rtr_devfn); - if (!pirq_router_dev) { - DBG(KERN_DEBUG "PCI: Interrupt router not found at " - "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); - return; + /* Use any vendor:device provided by the routing table or try all. */ + if (rt->rtr_vendor) { + dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus, + rt->rtr_devfn); + if (dev && pirq_try_router(r, rt, dev)) + pirq_router_dev = dev; + } else { + dev = NULL; + for_each_pci_dev(dev) { + if (pirq_try_router(r, rt, dev)) { + pirq_router_dev = dev; + break; + } + } } - for (h = pirq_routers; h->vendor; h++) { - /* First look for a router match */ - if (rt->rtr_vendor == h->vendor && - h->probe(r, pirq_router_dev, rt->rtr_device)) - break; - /* Fall back to a device match */ - if (pirq_router_dev->vendor == h->vendor && - h->probe(r, pirq_router_dev, pirq_router_dev->device)) - break; - } - dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", - pirq_router.name, - pirq_router_dev->vendor, pirq_router_dev->device); + if (pirq_router_dev) + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", + pirq_router.name, + pirq_router_dev->vendor, pirq_router_dev->device); + else + DBG(KERN_DEBUG "PCI: Interrupt router not found at " + "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); /* The device remains referenced for the kernel lifetime */ } -static struct irq_info *pirq_get_info(struct pci_dev *dev) +/* + * We're supposed to match on the PCI device only and not the function, + * but some BIOSes build their tables with the PCI function included + * for motherboard devices, so if a complete match is found, then give + * it precedence over a slot match. + */ +static struct irq_info *pirq_get_dev_info(struct pci_dev *dev) { struct irq_routing_table *rt = pirq_table; int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); + struct irq_info *slotinfo = NULL; struct irq_info *info; for (info = rt->slots; entries--; info++) - if (info->bus == dev->bus->number && - PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) - return info; - return NULL; + if (info->bus == dev->bus->number) { + if (info->devfn == dev->devfn) + return info; + if (!slotinfo && + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) + slotinfo = info; + } + return slotinfo; +} + +/* + * Buses behind bridges are typically not listed in the PIRQ routing table. + * Do the usual dance then and walk the tree of bridges up adjusting the + * pin number accordingly on the way until the originating root bus device + * has been reached and then use its routing information. + */ +static struct irq_info *pirq_get_info(struct pci_dev *dev, u8 *pin) +{ + struct pci_dev *temp_dev = dev; + struct irq_info *info; + u8 temp_pin = *pin; + u8 dpin = temp_pin; + + info = pirq_get_dev_info(dev); + while (!info && temp_dev->bus->parent) { + struct pci_dev *bridge = temp_dev->bus->self; + + temp_pin = pci_swizzle_interrupt_pin(temp_dev, temp_pin); + info = pirq_get_dev_info(bridge); + if (info) + dev_warn(&dev->dev, + "using bridge %s INT %c to get INT %c\n", + pci_name(bridge), + 'A' + temp_pin - 1, 'A' + dpin - 1); + + temp_dev = bridge; + } + *pin = temp_pin; + return info; } static int pcibios_lookup_irq(struct pci_dev *dev, int assign) { - u8 pin; struct irq_info *info; int i, pirq, newirq; + u8 dpin, pin; int irq = 0; u32 mask; struct irq_router *r = &pirq_router; @@ -1157,8 +1403,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) char *msg = NULL; /* Find IRQ pin */ - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - if (!pin) { + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &dpin); + if (!dpin) { dev_dbg(&dev->dev, "no interrupt pin\n"); return 0; } @@ -1171,20 +1417,21 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (!pirq_table) return 0; - info = pirq_get_info(dev); + pin = dpin; + info = pirq_get_info(dev, &pin); if (!info) { dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n", - 'A' + pin - 1); + 'A' + dpin - 1); return 0; } pirq = info->irq[pin - 1].link; mask = info->irq[pin - 1].bitmap; if (!pirq) { - dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1); + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + dpin - 1); return 0; } dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x", - 'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs); + 'A' + dpin - 1, pirq, mask, pirq_table->exclusive_irqs); mask &= pcibios_irq_mask; /* Work around broken HP Pavilion Notebooks which assign USB to @@ -1226,7 +1473,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) newirq = i; } } - dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin - 1, newirq); + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + dpin - 1, newirq); /* Check if it is hardcoded */ if ((pirq & 0xf0) == 0xf0) { @@ -1260,15 +1507,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) return 0; } } - dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", + msg, 'A' + dpin - 1, irq); /* Update IRQ for all devices with the same pirq value */ for_each_pci_dev(dev2) { - pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); - if (!pin) + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &dpin); + if (!dpin) continue; - info = pirq_get_info(dev2); + pin = dpin; + info = pirq_get_info(dev2, &pin); if (!info) continue; if (info->irq[pin - 1].link == pirq) { diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 147c30a81f15..1591d67e0bcd 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -93,6 +93,9 @@ static const unsigned long * const efi_tables[] = { #ifdef CONFIG_LOAD_UEFI_KEYS &efi.mokvar_table, #endif +#ifdef CONFIG_EFI_COCO_SECRET + &efi.coco_secret, +#endif }; u64 efi_setup; /* efi setup_data physical address */ diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 1e9ff28bc2e0..a60af0230e27 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -244,8 +244,10 @@ static inline bool uv_nmi_action_is(const char *action) /* Setup which NMI support is present in system */ static void uv_nmi_setup_mmrs(void) { + bool new_nmi_method_only = false; + /* First determine arch specific MMRs to handshake with BIOS */ - if (UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) { + if (UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) { /* UV2,3,4 setup */ uvh_nmi_mmrx = UVH_EVENT_OCCURRED0; uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED0_ALIAS; uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT; @@ -255,26 +257,25 @@ static void uv_nmi_setup_mmrs(void) uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2; uvh_nmi_mmrx_req_shift = 62; - } else if (UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK) { + } else if (UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK) { /* UV5+ setup */ uvh_nmi_mmrx = UVH_EVENT_OCCURRED1; uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED1_ALIAS; uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED1_EXTIO_INT0_SHFT; uvh_nmi_mmrx_type = "OCRD1-EXTIO_INT0"; - uvh_nmi_mmrx_supported = UVH_EXTIO_INT0_BROADCAST; - uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2; - uvh_nmi_mmrx_req_shift = 62; + new_nmi_method_only = true; /* Newer nmi always valid on UV5+ */ + uvh_nmi_mmrx_req = 0; /* no request bit to clear */ } else { - pr_err("UV:%s:cannot find EVENT_OCCURRED*_EXTIO_INT0\n", - __func__); + pr_err("UV:%s:NMI support not available on this system\n", __func__); return; } /* Then find out if new NMI is supported */ - if (likely(uv_read_local_mmr(uvh_nmi_mmrx_supported))) { - uv_write_local_mmr(uvh_nmi_mmrx_req, - 1UL << uvh_nmi_mmrx_req_shift); + if (new_nmi_method_only || uv_read_local_mmr(uvh_nmi_mmrx_supported)) { + if (uvh_nmi_mmrx_req) + uv_write_local_mmr(uvh_nmi_mmrx_req, + 1UL << uvh_nmi_mmrx_req_shift); nmi_mmr = uvh_nmi_mmrx; nmi_mmr_clear = uvh_nmi_mmrx_clear; nmi_mmr_pending = 1UL << uvh_nmi_mmrx_shift; @@ -985,7 +986,7 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) /* Clear global flags */ if (master) { - if (cpumask_weight(uv_nmi_cpu_mask)) + if (!cpumask_empty(uv_nmi_cpu_mask)) uv_nmi_cleanup_mask(); atomic_set(&uv_nmi_cpus_in_nmi, -1); atomic_set(&uv_nmi_cpu, -1); diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index c5e29db02a46..41d7669a97ad 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -67,7 +67,7 @@ void __init reserve_real_mode(void) memblock_reserve(0, SZ_1M); } -static void sme_sev_setup_real_mode(struct trampoline_header *th) +static void __init sme_sev_setup_real_mode(struct trampoline_header *th) { #ifdef CONFIG_AMD_MEM_ENCRYPT if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 8c1db5bf5d78..2eb62be6d256 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -24,6 +24,7 @@ SYM_DATA_START(real_mode_header) .long pa_sev_es_trampoline_start #endif #ifdef CONFIG_X86_64 + .long pa_trampoline_start64 .long pa_trampoline_pgd; #endif /* ACPI S3 wakeup */ diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index cc8391f86cdb..e38d61d6562e 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -70,7 +70,7 @@ SYM_CODE_START(trampoline_start) movw $__KERNEL_DS, %dx # Data segment descriptor # Enable protected mode - movl $X86_CR0_PE, %eax # protected mode (PE) bit + movl $(CR0_STATE & ~X86_CR0_PG), %eax movl %eax, %cr0 # into protected mode # flush prefetch and jump to startup_32 @@ -143,13 +143,24 @@ SYM_CODE_START(startup_32) movl %eax, %cr3 # Set up EFER + movl $MSR_EFER, %ecx + rdmsr + /* + * Skip writing to EFER if the register already has desired + * value (to avoid #VE for the TDX guest). + */ + cmp pa_tr_efer, %eax + jne .Lwrite_efer + cmp pa_tr_efer + 4, %edx + je .Ldone_efer +.Lwrite_efer: movl pa_tr_efer, %eax movl pa_tr_efer + 4, %edx - movl $MSR_EFER, %ecx wrmsr - # Enable paging and in turn activate Long Mode - movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax +.Ldone_efer: + # Enable paging and in turn activate Long Mode. + movl $CR0_STATE, %eax movl %eax, %cr0 /* @@ -161,6 +172,19 @@ SYM_CODE_START(startup_32) ljmpl $__KERNEL_CS, $pa_startup_64 SYM_CODE_END(startup_32) +SYM_CODE_START(pa_trampoline_compat) + /* + * In compatibility mode. Prep ESP and DX for startup_32, then disable + * paging and complete the switch to legacy 32-bit mode. + */ + movl $rm_stack_end, %esp + movw $__KERNEL_DS, %dx + + movl $(CR0_STATE & ~X86_CR0_PG), %eax + movl %eax, %cr0 + ljmpl $__KERNEL32_CS, $pa_startup_32 +SYM_CODE_END(pa_trampoline_compat) + .section ".text64","ax" .code64 .balign 4 @@ -169,6 +193,20 @@ SYM_CODE_START(startup_64) jmpq *tr_start(%rip) SYM_CODE_END(startup_64) +SYM_CODE_START(trampoline_start64) + /* + * APs start here on a direct transfer from 64-bit BIOS with identity + * mapped page tables. Load the kernel's GDT in order to gear down to + * 32-bit mode (to handle 4-level vs. 5-level paging), and to (re)load + * segment registers. Load the zero IDT so any fault triggers a + * shutdown instead of jumping back into BIOS. + */ + lidt tr_idt(%rip) + lgdt tr_gdt64(%rip) + + ljmpl *tr_compat(%rip) +SYM_CODE_END(trampoline_start64) + .section ".rodata","a" # Duplicate the global descriptor table # so the kernel can live anywhere @@ -182,6 +220,17 @@ SYM_DATA_START(tr_gdt) .quad 0x00cf93000000ffff # __KERNEL_DS SYM_DATA_END_LABEL(tr_gdt, SYM_L_LOCAL, tr_gdt_end) +SYM_DATA_START(tr_gdt64) + .short tr_gdt_end - tr_gdt - 1 # gdt limit + .long pa_tr_gdt + .long 0 +SYM_DATA_END(tr_gdt64) + +SYM_DATA_START(tr_compat) + .long pa_trampoline_compat + .short __KERNEL32_CS +SYM_DATA_END(tr_compat) + .bss .balign PAGE_SIZE SYM_DATA(trampoline_pgd, .space PAGE_SIZE) diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index 5033e640f957..4331c32c47f8 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -1,4 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ .section ".rodata","a" .balign 16 -SYM_DATA_LOCAL(tr_idt, .fill 1, 6, 0) + +/* + * When a bootloader hands off to the kernel in 32-bit mode an + * IDT with a 2-byte limit and 4-byte base is needed. When a boot + * loader hands off to a kernel 64-bit mode the base address + * extends to 8-bytes. Reserve enough space for either scenario. + */ +SYM_DATA_START_LOCAL(tr_idt) + .short 0 + .quad 0 +SYM_DATA_END(tr_idt) diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c index 1d6437e6d2ba..a6f4d8388ad8 100644 --- a/arch/x86/realmode/rm/wakemain.c +++ b/arch/x86/realmode/rm/wakemain.c @@ -62,8 +62,12 @@ static void send_morse(const char *pattern) } } +struct port_io_ops pio_ops; + void main(void) { + init_default_io_ops(); + /* Kill machine if structures are wrong */ if (wakeup_header.real_magic != 0x12345678) while (1) diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S new file mode 100644 index 000000000000..49a54356ae99 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdxcall.S @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <asm/asm-offsets.h> +#include <asm/tdx.h> + +/* + * TDCALL and SEAMCALL are supported in Binutils >= 2.36. + */ +#define tdcall .byte 0x66,0x0f,0x01,0xcc +#define seamcall .byte 0x66,0x0f,0x01,0xcf + +/* + * TDX_MODULE_CALL - common helper macro for both + * TDCALL and SEAMCALL instructions. + * + * TDCALL - used by TDX guests to make requests to the + * TDX module and hypercalls to the VMM. + * SEAMCALL - used by TDX hosts to make requests to the + * TDX module. + */ +.macro TDX_MODULE_CALL host:req + /* + * R12 will be used as temporary storage for struct tdx_module_output + * pointer. Since R12-R15 registers are not used by TDCALL/SEAMCALL + * services supported by this function, it can be reused. + */ + + /* Callee saved, so preserve it */ + push %r12 + + /* + * Push output pointer to stack. + * After the operation, it will be fetched into R12 register. + */ + push %r9 + + /* Mangle function call ABI into TDCALL/SEAMCALL ABI: */ + /* Move Leaf ID to RAX */ + mov %rdi, %rax + /* Move input 4 to R9 */ + mov %r8, %r9 + /* Move input 3 to R8 */ + mov %rcx, %r8 + /* Move input 1 to RCX */ + mov %rsi, %rcx + /* Leave input param 2 in RDX */ + + .if \host + seamcall + /* + * SEAMCALL instruction is essentially a VMExit from VMX root + * mode to SEAM VMX root mode. VMfailInvalid (CF=1) indicates + * that the targeted SEAM firmware is not loaded or disabled, + * or P-SEAMLDR is busy with another SEAMCALL. %rax is not + * changed in this case. + * + * Set %rax to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid. + * This value will never be used as actual SEAMCALL error code as + * it is from the Reserved status code class. + */ + jnc .Lno_vmfailinvalid + mov $TDX_SEAMCALL_VMFAILINVALID, %rax +.Lno_vmfailinvalid: + + .else + tdcall + .endif + + /* + * Fetch output pointer from stack to R12 (It is used + * as temporary storage) + */ + pop %r12 + + /* + * Since this macro can be invoked with NULL as an output pointer, + * check if caller provided an output struct before storing output + * registers. + * + * Update output registers, even if the call failed (RAX != 0). + * Other registers may contain details of the failure. + */ + test %r12, %r12 + jz .Lno_output_struct + + /* Copy result registers to output struct: */ + movq %rcx, TDX_MODULE_rcx(%r12) + movq %rdx, TDX_MODULE_rdx(%r12) + movq %r8, TDX_MODULE_r8(%r12) + movq %r9, TDX_MODULE_r9(%r12) + movq %r10, TDX_MODULE_r10(%r12) + movq %r11, TDX_MODULE_r11(%r12) + +.Lno_output_struct: + /* Restore the state of R12 register */ + pop %r12 +.endm diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5038edb79ad5..ca85d1409917 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -30,7 +30,6 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/edd.h> -#include <linux/objtool.h> #include <xen/xen.h> #include <xen/events.h> @@ -165,7 +164,6 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *bx &= maskebx; } -STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ static bool __init xen_check_mwait(void) { diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 688aa8b6ae29..ba7af2eca755 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -260,8 +260,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); - if (ctxt == NULL) + if (ctxt == NULL) { + cpumask_clear_cpu(cpu, xen_cpu_initialized_map); + cpumask_clear_cpu(cpu, cpu_callout_mask); return -ENOMEM; + } gdt = get_cpu_gdt_rw(cpu); diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index bd113bc6e192..0b0f0172cced 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -4,6 +4,7 @@ config XTENSA select ARCH_32BIT_OFF_T select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_CURRENT_STACK_POINTER + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT if MMU select ARCH_HAS_SYNC_DMA_FOR_CPU if MMU select ARCH_HAS_SYNC_DMA_FOR_DEVICE if MMU @@ -29,8 +30,10 @@ config XTENSA select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL + select HAVE_ARCH_KCSAN select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK + select HAVE_CONTEXT_TRACKING select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_EXIT_THREAD @@ -42,6 +45,7 @@ config XTENSA select HAVE_PERF_EVENTS select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING_GEN select IRQ_DOMAIN select MODULES_USE_ELF_RELA select PERF_USE_VMALLOC @@ -79,6 +83,7 @@ config STACKTRACE_SUPPORT config MMU def_bool n + select PFAULT config HAVE_XTENSA_GPIO32 def_bool n @@ -178,6 +183,16 @@ config XTENSA_FAKE_NMI If unsure, say N. +config PFAULT + bool "Handle protection faults" if EXPERT && !MMU + default y + help + Handle protection faults. MMU configurations must enable it. + noMMU configurations may disable it if used memory map never + generates protection faults or faults are always fatal. + + If unsure, say Y. + config XTENSA_UNALIGNED_USER bool "Unaligned memory access in user space" help @@ -773,6 +788,9 @@ endmenu menu "Power management options" +config ARCH_HIBERNATION_POSSIBLE + def_bool y + source "kernel/power/Kconfig" endmenu diff --git a/arch/xtensa/boot/lib/Makefile b/arch/xtensa/boot/lib/Makefile index e3d717c7bfa1..162d10af36f3 100644 --- a/arch/xtensa/boot/lib/Makefile +++ b/arch/xtensa/boot/lib/Makefile @@ -16,6 +16,7 @@ CFLAGS_REMOVE_inffast.o = -pg endif KASAN_SANITIZE := n +KCSAN_SANITIZE := n CFLAGS_REMOVE_inflate.o += -fstack-protector -fstack-protector-strong CFLAGS_REMOVE_zmem.o += -fstack-protector -fstack-protector-strong diff --git a/arch/xtensa/include/asm/barrier.h b/arch/xtensa/include/asm/barrier.h index d6f8d4ddc2bc..898ea397e9bc 100644 --- a/arch/xtensa/include/asm/barrier.h +++ b/arch/xtensa/include/asm/barrier.h @@ -11,9 +11,15 @@ #include <asm/core.h> -#define mb() ({ __asm__ __volatile__("memw" : : : "memory"); }) -#define rmb() barrier() -#define wmb() mb() +#define __mb() ({ __asm__ __volatile__("memw" : : : "memory"); }) +#define __rmb() barrier() +#define __wmb() __mb() + +#ifdef CONFIG_SMP +#define __smp_mb() __mb() +#define __smp_rmb() __rmb() +#define __smp_wmb() __wmb() +#endif #if XCHAL_HAVE_S32C1I #define __smp_mb__before_atomic() barrier() diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h index cd225896c40f..e02ec5833389 100644 --- a/arch/xtensa/include/asm/bitops.h +++ b/arch/xtensa/include/asm/bitops.h @@ -99,7 +99,7 @@ static inline unsigned long __fls(unsigned long word) #if XCHAL_HAVE_EXCLUSIVE #define BIT_OP(op, insn, inv) \ -static inline void op##_bit(unsigned int bit, volatile unsigned long *p)\ +static inline void arch_##op##_bit(unsigned int bit, volatile unsigned long *p)\ { \ unsigned long tmp; \ unsigned long mask = 1UL << (bit & 31); \ @@ -119,7 +119,7 @@ static inline void op##_bit(unsigned int bit, volatile unsigned long *p)\ #define TEST_AND_BIT_OP(op, insn, inv) \ static inline int \ -test_and_##op##_bit(unsigned int bit, volatile unsigned long *p) \ +arch_test_and_##op##_bit(unsigned int bit, volatile unsigned long *p) \ { \ unsigned long tmp, value; \ unsigned long mask = 1UL << (bit & 31); \ @@ -142,7 +142,7 @@ test_and_##op##_bit(unsigned int bit, volatile unsigned long *p) \ #elif XCHAL_HAVE_S32C1I #define BIT_OP(op, insn, inv) \ -static inline void op##_bit(unsigned int bit, volatile unsigned long *p)\ +static inline void arch_##op##_bit(unsigned int bit, volatile unsigned long *p)\ { \ unsigned long tmp, value; \ unsigned long mask = 1UL << (bit & 31); \ @@ -163,7 +163,7 @@ static inline void op##_bit(unsigned int bit, volatile unsigned long *p)\ #define TEST_AND_BIT_OP(op, insn, inv) \ static inline int \ -test_and_##op##_bit(unsigned int bit, volatile unsigned long *p) \ +arch_test_and_##op##_bit(unsigned int bit, volatile unsigned long *p) \ { \ unsigned long tmp, value; \ unsigned long mask = 1UL << (bit & 31); \ @@ -205,6 +205,8 @@ BIT_OPS(change, "xor", ) #undef BIT_OP #undef TEST_AND_BIT_OP +#include <asm-generic/bitops/instrumented-atomic.h> + #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> diff --git a/arch/xtensa/include/asm/coprocessor.h b/arch/xtensa/include/asm/coprocessor.h index 0fbe2a740b8d..3b1a0d5d2169 100644 --- a/arch/xtensa/include/asm/coprocessor.h +++ b/arch/xtensa/include/asm/coprocessor.h @@ -142,11 +142,12 @@ typedef struct { XCHAL_CP6_SA_LIST(2) } xtregs_cp6_t typedef struct { XCHAL_CP7_SA_LIST(2) } xtregs_cp7_t __attribute__ ((aligned (XCHAL_CP7_SA_ALIGN))); -extern struct thread_info* coprocessor_owner[XCHAL_CP_MAX]; -extern void coprocessor_flush(struct thread_info*, int); - -extern void coprocessor_release_all(struct thread_info*); -extern void coprocessor_flush_all(struct thread_info*); +struct thread_info; +void coprocessor_flush(struct thread_info *ti, int cp_index); +void coprocessor_release_all(struct thread_info *ti); +void coprocessor_flush_all(struct thread_info *ti); +void coprocessor_flush_release_all(struct thread_info *ti); +void local_coprocessors_flush_release_all(void); #endif /* XTENSA_HAVE_COPROCESSORS */ diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 4489a27d527a..76bc63127c66 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -246,6 +246,13 @@ extern unsigned long __get_wchan(struct task_struct *p); v; \ }) +#define xtensa_xsr(x, sr) \ + ({ \ + unsigned int __v__ = (unsigned int)(x); \ + __asm__ __volatile__ ("xsr %0, " __stringify(sr) : "+a"(__v__)); \ + __v__; \ + }) + #if XCHAL_HAVE_EXTERN_REGS static inline void set_er(unsigned long value, unsigned long addr) diff --git a/arch/xtensa/include/asm/sections.h b/arch/xtensa/include/asm/sections.h index a8c42d08e281..3bc6b9afa993 100644 --- a/arch/xtensa/include/asm/sections.h +++ b/arch/xtensa/include/asm/sections.h @@ -29,7 +29,7 @@ extern char _Level5InterruptVector_text_end[]; extern char _Level6InterruptVector_text_start[]; extern char _Level6InterruptVector_text_end[]; #endif -#ifdef CONFIG_SMP +#ifdef CONFIG_SECONDARY_RESET_VECTOR extern char _SecondaryResetVector_text_start[]; extern char _SecondaryResetVector_text_end[]; #endif diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index f6fcbba1d02f..326db1c1d5d8 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -52,12 +52,21 @@ struct thread_info { __u32 cpu; /* current CPU */ __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ - unsigned long cpenable; #if XCHAL_HAVE_EXCLUSIVE /* result of the most recent exclusive store */ unsigned long atomctl8; #endif +#ifdef CONFIG_USER_ABI_CALL0_PROBE + /* Address where PS.WOE was enabled by the ABI probing code */ + unsigned long ps_woe_fix_addr; +#endif + /* + * If i-th bit is set then coprocessor state is loaded into the + * coprocessor i on CPU cp_owner_cpu. + */ + unsigned long cpenable; + u32 cp_owner_cpu; /* Allocate storage for extra user states and coprocessor states. */ #if XTENSA_HAVE_COPROCESSORS xtregs_coprocessor_t xtregs_cp; diff --git a/arch/xtensa/include/asm/timex.h b/arch/xtensa/include/asm/timex.h index 233ec75e60c6..3f2462f2d027 100644 --- a/arch/xtensa/include/asm/timex.h +++ b/arch/xtensa/include/asm/timex.h @@ -29,10 +29,6 @@ extern unsigned long ccount_freq; -typedef unsigned long long cycles_t; - -#define get_cycles() (0) - void local_timer_setup(unsigned cpu); /* @@ -59,4 +55,6 @@ static inline void set_linux_timer (unsigned long ccompare) xtensa_set_sr(ccompare, SREG_CCOMPARE + LINUX_TIMER); } +#include <asm-generic/timex.h> + #endif /* _XTENSA_TIMEX_H */ diff --git a/arch/xtensa/include/asm/traps.h b/arch/xtensa/include/asm/traps.h index 6fa47cd8e02d..6f74ccc0c7ea 100644 --- a/arch/xtensa/include/asm/traps.h +++ b/arch/xtensa/include/asm/traps.h @@ -12,6 +12,8 @@ #include <asm/ptrace.h> +typedef void xtensa_exception_handler(struct pt_regs *regs); + /* * Per-CPU exception handling data structure. * EXCSAVE1 points to it. @@ -25,31 +27,47 @@ struct exc_table { void *fixup; /* For passing a parameter to fixup */ void *fixup_param; +#if XTENSA_HAVE_COPROCESSORS + /* Pointers to owner struct thread_info */ + struct thread_info *coprocessor_owner[XCHAL_CP_MAX]; +#endif /* Fast user exception handlers */ void *fast_user_handler[EXCCAUSE_N]; /* Fast kernel exception handlers */ void *fast_kernel_handler[EXCCAUSE_N]; /* Default C-Handlers */ - void *default_handler[EXCCAUSE_N]; + xtensa_exception_handler *default_handler[EXCCAUSE_N]; }; -/* - * handler must be either of the following: - * void (*)(struct pt_regs *regs); - * void (*)(struct pt_regs *regs, unsigned long exccause); - */ -extern void * __init trap_set_handler(int cause, void *handler); -extern void do_unhandled(struct pt_regs *regs, unsigned long exccause); -void fast_second_level_miss(void); +DECLARE_PER_CPU(struct exc_table, exc_table); + +xtensa_exception_handler * +__init trap_set_handler(int cause, xtensa_exception_handler *handler); + +asmlinkage void fast_illegal_instruction_user(void); +asmlinkage void fast_syscall_user(void); +asmlinkage void fast_alloca(void); +asmlinkage void fast_unaligned(void); +asmlinkage void fast_second_level_miss(void); +asmlinkage void fast_store_prohibited(void); +asmlinkage void fast_coprocessor(void); + +asmlinkage void kernel_exception(void); +asmlinkage void user_exception(void); +asmlinkage void system_call(struct pt_regs *regs); + +void do_IRQ(int hwirq, struct pt_regs *regs); +void do_page_fault(struct pt_regs *regs); +void do_unhandled(struct pt_regs *regs); /* Initialize minimal exc_table structure sufficient for basic paging */ static inline void __init early_trap_init(void) { - static struct exc_table exc_table __initdata = { + static struct exc_table init_exc_table __initdata = { .fast_kernel_handler[EXCCAUSE_DTLB_MISS] = fast_second_level_miss, }; - __asm__ __volatile__("wsr %0, excsave1\n" : : "a" (&exc_table)); + xtensa_set_sr(&init_exc_table, excsave1); } void secondary_trap_init(void); diff --git a/arch/xtensa/kernel/Makefile b/arch/xtensa/kernel/Makefile index 5fd6cd15e0fb..897c1c741058 100644 --- a/arch/xtensa/kernel/Makefile +++ b/arch/xtensa/kernel/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_XTENSA_VARIANT_HAVE_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_S32C1I_SELFTEST) += s32c1i_selftest.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_HIBERNATION) += hibernate.o # In the Xtensa architecture, assembly generates literals which must always # precede the L32R instruction with a relative offset less than 256 kB. diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c index 37278e2785fb..da38de20ae59 100644 --- a/arch/xtensa/kernel/asm-offsets.c +++ b/arch/xtensa/kernel/asm-offsets.c @@ -21,6 +21,7 @@ #include <linux/ptrace.h> #include <linux/mm.h> #include <linux/kbuild.h> +#include <linux/suspend.h> #include <asm/ptrace.h> #include <asm/traps.h> @@ -87,14 +88,19 @@ int main(void) OFFSET(TI_STSTUS, thread_info, status); OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); +#ifdef CONFIG_USER_ABI_CALL0_PROBE + OFFSET(TI_PS_WOE_FIX_ADDR, thread_info, ps_woe_fix_addr); +#endif /* struct thread_info (offset from start_struct) */ DEFINE(THREAD_RA, offsetof (struct task_struct, thread.ra)); DEFINE(THREAD_SP, offsetof (struct task_struct, thread.sp)); - DEFINE(THREAD_CPENABLE, offsetof (struct thread_info, cpenable)); #if XCHAL_HAVE_EXCLUSIVE DEFINE(THREAD_ATOMCTL8, offsetof (struct thread_info, atomctl8)); #endif + DEFINE(THREAD_CPENABLE, offsetof(struct thread_info, cpenable)); + DEFINE(THREAD_CPU, offsetof(struct thread_info, cpu)); + DEFINE(THREAD_CP_OWNER_CPU, offsetof(struct thread_info, cp_owner_cpu)); #if XTENSA_HAVE_COPROCESSORS DEFINE(THREAD_XTREGS_CP0, offsetof(struct thread_info, xtregs_cp.cp0)); DEFINE(THREAD_XTREGS_CP1, offsetof(struct thread_info, xtregs_cp.cp1)); @@ -137,11 +143,22 @@ int main(void) DEFINE(EXC_TABLE_DOUBLE_SAVE, offsetof(struct exc_table, double_save)); DEFINE(EXC_TABLE_FIXUP, offsetof(struct exc_table, fixup)); DEFINE(EXC_TABLE_PARAM, offsetof(struct exc_table, fixup_param)); +#if XTENSA_HAVE_COPROCESSORS + DEFINE(EXC_TABLE_COPROCESSOR_OWNER, + offsetof(struct exc_table, coprocessor_owner)); +#endif DEFINE(EXC_TABLE_FAST_USER, offsetof(struct exc_table, fast_user_handler)); DEFINE(EXC_TABLE_FAST_KERNEL, offsetof(struct exc_table, fast_kernel_handler)); DEFINE(EXC_TABLE_DEFAULT, offsetof(struct exc_table, default_handler)); +#ifdef CONFIG_HIBERNATION + DEFINE(PBE_ADDRESS, offsetof(struct pbe, address)); + DEFINE(PBE_ORIG_ADDRESS, offsetof(struct pbe, orig_address)); + DEFINE(PBE_NEXT, offsetof(struct pbe, next)); + DEFINE(PBE_SIZE, sizeof(struct pbe)); +#endif + return 0; } diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index c7b9f12896f2..ef33e76e07d8 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -19,6 +19,26 @@ #include <asm/current.h> #include <asm/regs.h> +/* + * Rules for coprocessor state manipulation on SMP: + * + * - a task may have live coprocessors only on one CPU. + * + * - whether coprocessor context of task T is live on some CPU is + * denoted by T's thread_info->cpenable. + * + * - non-zero thread_info->cpenable means that thread_info->cp_owner_cpu + * is valid in the T's thread_info. Zero thread_info->cpenable means that + * coprocessor context is valid in the T's thread_info. + * + * - if a coprocessor context of task T is live on CPU X, only CPU X changes + * T's thread_info->cpenable, cp_owner_cpu and coprocessor save area. + * This is done by making sure that for the task T with live coprocessor + * on CPU X cpenable SR is 0 when T runs on any other CPU Y. + * When fast_coprocessor exception is taken on CPU Y it goes to the + * C-level do_coprocessor that uses IPI to make CPU X flush T's coprocessors. + */ + #if XTENSA_HAVE_COPROCESSORS /* @@ -30,34 +50,30 @@ .align 4; \ .Lsave_cp_regs_cp##x: \ xchal_cp##x##_store a2 a3 a4 a5 a6; \ - jx a0; \ + ret; \ .endif -#define SAVE_CP_REGS_TAB(x) \ - .if XTENSA_HAVE_COPROCESSOR(x); \ - .long .Lsave_cp_regs_cp##x; \ - .else; \ - .long 0; \ - .endif; \ - .long THREAD_XTREGS_CP##x - - #define LOAD_CP_REGS(x) \ .if XTENSA_HAVE_COPROCESSOR(x); \ .align 4; \ .Lload_cp_regs_cp##x: \ xchal_cp##x##_load a2 a3 a4 a5 a6; \ - jx a0; \ + ret; \ .endif -#define LOAD_CP_REGS_TAB(x) \ +#define CP_REGS_TAB(x) \ .if XTENSA_HAVE_COPROCESSOR(x); \ + .long .Lsave_cp_regs_cp##x; \ .long .Lload_cp_regs_cp##x; \ .else; \ - .long 0; \ + .long 0, 0; \ .endif; \ .long THREAD_XTREGS_CP##x +#define CP_REGS_TAB_SAVE 0 +#define CP_REGS_TAB_LOAD 4 +#define CP_REGS_TAB_OFFSET 8 + __XTENSA_HANDLER SAVE_CP_REGS(0) @@ -79,25 +95,15 @@ LOAD_CP_REGS(7) .align 4 -.Lsave_cp_regs_jump_table: - SAVE_CP_REGS_TAB(0) - SAVE_CP_REGS_TAB(1) - SAVE_CP_REGS_TAB(2) - SAVE_CP_REGS_TAB(3) - SAVE_CP_REGS_TAB(4) - SAVE_CP_REGS_TAB(5) - SAVE_CP_REGS_TAB(6) - SAVE_CP_REGS_TAB(7) - -.Lload_cp_regs_jump_table: - LOAD_CP_REGS_TAB(0) - LOAD_CP_REGS_TAB(1) - LOAD_CP_REGS_TAB(2) - LOAD_CP_REGS_TAB(3) - LOAD_CP_REGS_TAB(4) - LOAD_CP_REGS_TAB(5) - LOAD_CP_REGS_TAB(6) - LOAD_CP_REGS_TAB(7) +.Lcp_regs_jump_table: + CP_REGS_TAB(0) + CP_REGS_TAB(1) + CP_REGS_TAB(2) + CP_REGS_TAB(3) + CP_REGS_TAB(4) + CP_REGS_TAB(5) + CP_REGS_TAB(6) + CP_REGS_TAB(7) /* * Entry condition: @@ -115,9 +121,37 @@ ENTRY(fast_coprocessor) + s32i a3, a2, PT_AREG3 + +#ifdef CONFIG_SMP + /* + * Check if any coprocessor context is live on another CPU + * and if so go through the C-level coprocessor exception handler + * to flush it to memory. + */ + GET_THREAD_INFO (a0, a2) + l32i a3, a0, THREAD_CPENABLE + beqz a3, .Lload_local + + /* + * Pairs with smp_wmb in local_coprocessor_release_all + * and with both memws below. + */ + memw + l32i a3, a0, THREAD_CPU + l32i a0, a0, THREAD_CP_OWNER_CPU + beq a0, a3, .Lload_local + + rsr a0, ps + l32i a3, a2, PT_AREG3 + bbci.l a0, PS_UM_BIT, 1f + call0 user_exception +1: call0 kernel_exception +#endif + /* Save remaining registers a1-a3 and SAR */ - s32i a3, a2, PT_AREG3 +.Lload_local: rsr a3, sar s32i a1, a2, PT_AREG1 s32i a3, a2, PT_SAR @@ -125,13 +159,15 @@ ENTRY(fast_coprocessor) rsr a2, depc s32i a2, a1, PT_AREG2 - /* - * The hal macros require up to 4 temporary registers. We use a3..a6. - */ + /* The hal macros require up to 4 temporary registers. We use a3..a6. */ s32i a4, a1, PT_AREG4 s32i a5, a1, PT_AREG5 s32i a6, a1, PT_AREG6 + s32i a7, a1, PT_AREG7 + s32i a8, a1, PT_AREG8 + s32i a9, a1, PT_AREG9 + s32i a10, a1, PT_AREG10 /* Find coprocessor number. Subtract first CP EXCCAUSE from EXCCAUSE */ @@ -148,58 +184,74 @@ ENTRY(fast_coprocessor) wsr a0, cpenable rsync - /* Retrieve previous owner. (a3 still holds CP number) */ + /* Get coprocessor save/load table entry (a7). */ - movi a0, coprocessor_owner # list of owners - addx4 a0, a3, a0 # entry for CP - l32i a4, a0, 0 + movi a7, .Lcp_regs_jump_table + addx8 a7, a3, a7 + addx4 a7, a3, a7 - beqz a4, 1f # skip 'save' if no previous owner + /* Retrieve previous owner (a8). */ - /* Disable coprocessor for previous owner. (a2 = 1 << CP number) */ + rsr a0, excsave1 # exc_table + addx4 a0, a3, a0 # entry for CP + l32i a8, a0, EXC_TABLE_COPROCESSOR_OWNER + + /* Set new owner (a9). */ - l32i a5, a4, THREAD_CPENABLE - xor a5, a5, a2 # (1 << cp-id) still in a2 - s32i a5, a4, THREAD_CPENABLE + GET_THREAD_INFO (a9, a1) + l32i a4, a9, THREAD_CPU + s32i a9, a0, EXC_TABLE_COPROCESSOR_OWNER + s32i a4, a9, THREAD_CP_OWNER_CPU /* - * Get context save area and 'call' save routine. - * (a4 still holds previous owner (thread_info), a3 CP number) + * Enable coprocessor for the new owner. (a2 = 1 << CP number) + * This can be done before loading context into the coprocessor. */ + l32i a4, a9, THREAD_CPENABLE + or a4, a4, a2 - movi a5, .Lsave_cp_regs_jump_table - movi a0, 2f # a0: 'return' address - addx8 a3, a3, a5 # a3: coprocessor number - l32i a2, a3, 4 # a2: xtregs offset - l32i a3, a3, 0 # a3: jump address - add a2, a2, a4 - jx a3 + /* + * Make sure THREAD_CP_OWNER_CPU is in memory before updating + * THREAD_CPENABLE + */ + memw # (2) + s32i a4, a9, THREAD_CPENABLE - /* Note that only a0 and a1 were preserved. */ + beqz a8, 1f # skip 'save' if no previous owner -2: rsr a3, exccause - addi a3, a3, -EXCCAUSE_COPROCESSOR0_DISABLED - movi a0, coprocessor_owner - addx4 a0, a3, a0 + /* Disable coprocessor for previous owner. (a2 = 1 << CP number) */ - /* Set new 'owner' (a0 points to the CP owner, a3 contains the CP nr) */ + l32i a10, a8, THREAD_CPENABLE + xor a10, a10, a2 -1: GET_THREAD_INFO (a4, a1) - s32i a4, a0, 0 + /* Get context save area and call save routine. */ - /* Get context save area and 'call' load routine. */ + l32i a2, a7, CP_REGS_TAB_OFFSET + l32i a3, a7, CP_REGS_TAB_SAVE + add a2, a2, a8 + callx0 a3 - movi a5, .Lload_cp_regs_jump_table - movi a0, 1f - addx8 a3, a3, a5 - l32i a2, a3, 4 # a2: xtregs offset - l32i a3, a3, 0 # a3: jump address - add a2, a2, a4 - jx a3 + /* + * Make sure coprocessor context and THREAD_CP_OWNER_CPU are in memory + * before updating THREAD_CPENABLE + */ + memw # (3) + s32i a10, a8, THREAD_CPENABLE +1: + /* Get context save area and call load routine. */ + + l32i a2, a7, CP_REGS_TAB_OFFSET + l32i a3, a7, CP_REGS_TAB_LOAD + add a2, a2, a9 + callx0 a3 /* Restore all registers and return from exception handler. */ -1: l32i a6, a1, PT_AREG6 + l32i a10, a1, PT_AREG10 + l32i a9, a1, PT_AREG9 + l32i a8, a1, PT_AREG8 + l32i a7, a1, PT_AREG7 + l32i a6, a1, PT_AREG6 l32i a5, a1, PT_AREG5 l32i a4, a1, PT_AREG4 @@ -230,29 +282,21 @@ ENDPROC(fast_coprocessor) ENTRY(coprocessor_flush) - /* reserve 4 bytes on stack to save a0 */ - abi_entry(4) - - s32i a0, a1, 0 - movi a0, .Lsave_cp_regs_jump_table - addx8 a3, a3, a0 - l32i a4, a3, 4 - l32i a3, a3, 0 - add a2, a2, a4 - beqz a3, 1f - callx0 a3 -1: l32i a0, a1, 0 - - abi_ret(4) + abi_entry_default + + movi a4, .Lcp_regs_jump_table + addx8 a4, a3, a4 + addx4 a3, a3, a4 + l32i a4, a3, CP_REGS_TAB_SAVE + beqz a4, 1f + l32i a3, a3, CP_REGS_TAB_OFFSET + add a2, a2, a3 + mov a7, a0 + callx0 a4 + mov a0, a7 +1: + abi_ret_default ENDPROC(coprocessor_flush) - .data - -ENTRY(coprocessor_owner) - - .fill XCHAL_CP_MAX, 4, 0 - -END(coprocessor_owner) - #endif /* XTENSA_HAVE_COPROCESSORS */ diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index 6b6eff658795..e3eae648ba2e 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -28,15 +28,6 @@ #include <asm/tlbflush.h> #include <variant/tie-asm.h> -/* Unimplemented features. */ - -#undef KERNEL_STACK_OVERFLOW_CHECK - -/* Not well tested. - * - * - fast_coprocessor - */ - /* * Macro to find first bit set in WINDOWBASE from the left + 1 * @@ -178,28 +169,26 @@ _user_exception: /* Save only live registers. */ -UABI_W _bbsi.l a2, 1, 1f +UABI_W _bbsi.l a2, 1, .Lsave_window_registers s32i a4, a1, PT_AREG4 s32i a5, a1, PT_AREG5 s32i a6, a1, PT_AREG6 s32i a7, a1, PT_AREG7 -UABI_W _bbsi.l a2, 2, 1f +UABI_W _bbsi.l a2, 2, .Lsave_window_registers s32i a8, a1, PT_AREG8 s32i a9, a1, PT_AREG9 s32i a10, a1, PT_AREG10 s32i a11, a1, PT_AREG11 -UABI_W _bbsi.l a2, 3, 1f +UABI_W _bbsi.l a2, 3, .Lsave_window_registers s32i a12, a1, PT_AREG12 s32i a13, a1, PT_AREG13 s32i a14, a1, PT_AREG14 s32i a15, a1, PT_AREG15 #if defined(USER_SUPPORT_WINDOWED) - _bnei a2, 1, 1f # only one valid frame? + /* If only one valid frame skip saving regs. */ - /* Only one valid frame, skip saving regs. */ - - j 2f + beqi a2, 1, common_exception /* Save the remaining registers. * We have to save all registers up to the first '1' from @@ -208,8 +197,8 @@ UABI_W _bbsi.l a2, 3, 1f * All register frames starting from the top field to the marked '1' * must be saved. */ - -1: addi a3, a2, -1 # eliminate '1' in bit 0: yyyyxxww0 +.Lsave_window_registers: + addi a3, a2, -1 # eliminate '1' in bit 0: yyyyxxww0 neg a3, a3 # yyyyxxww0 -> YYYYXXWW1+1 and a3, a3, a2 # max. only one bit is set @@ -250,7 +239,7 @@ UABI_W _bbsi.l a2, 3, 1f /* We are back to the original stack pointer (a1) */ #endif -2: /* Now, jump to the common exception handler. */ + /* Now, jump to the common exception handler. */ j common_exception @@ -350,15 +339,6 @@ KABI_W _bbsi.l a2, 3, 1f l32i a0, a1, PT_AREG0 # restore saved a0 wsr a0, depc -#ifdef KERNEL_STACK_OVERFLOW_CHECK - - /* Stack overflow check, for debugging */ - extui a2, a1, TASK_SIZE_BITS,XX - movi a3, SIZE?? - _bge a2, a3, out_of_stack_panic - -#endif - /* * This is the common exception handler. * We get here from the user exception handler or simply by falling through @@ -442,7 +422,6 @@ KABI_W or a3, a3, a0 moveqz a3, a0, a2 # a3 = LOCKLEVEL iff interrupt KABI_W movi a2, PS_WOE_MASK KABI_W or a3, a3, a2 - rsr a2, exccause #endif /* restore return address (or 0 if return to userspace) */ @@ -469,42 +448,56 @@ KABI_W or a3, a3, a2 save_xtregs_opt a1 a3 a4 a5 a6 a7 PT_XTREGS_OPT +#ifdef CONFIG_TRACE_IRQFLAGS + rsr abi_tmp0, ps + extui abi_tmp0, abi_tmp0, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH + beqz abi_tmp0, 1f + abi_call trace_hardirqs_off +1: +#endif +#ifdef CONFIG_CONTEXT_TRACKING + l32i abi_tmp0, a1, PT_PS + bbci.l abi_tmp0, PS_UM_BIT, 1f + abi_call context_tracking_user_exit +1: +#endif + /* Go to second-level dispatcher. Set up parameters to pass to the * exception handler and call the exception handler. */ - rsr a4, excsave1 - addx4 a4, a2, a4 - l32i a4, a4, EXC_TABLE_DEFAULT # load handler - mov abi_arg1, a2 # pass EXCCAUSE - mov abi_arg0, a1 # pass stack frame + l32i abi_arg1, a1, PT_EXCCAUSE # pass EXCCAUSE + rsr abi_tmp0, excsave1 + addx4 abi_tmp0, abi_arg1, abi_tmp0 + l32i abi_tmp0, abi_tmp0, EXC_TABLE_DEFAULT # load handler + mov abi_arg0, a1 # pass stack frame /* Call the second-level handler */ - abi_callx a4 + abi_callx abi_tmp0 /* Jump here for exception exit */ .global common_exception_return common_exception_return: #if XTENSA_FAKE_NMI - l32i abi_tmp0, a1, PT_EXCCAUSE - movi abi_tmp1, EXCCAUSE_MAPPED_NMI - l32i abi_saved1, a1, PT_PS - beq abi_tmp0, abi_tmp1, .Lrestore_state + l32i abi_tmp0, a1, PT_EXCCAUSE + movi abi_tmp1, EXCCAUSE_MAPPED_NMI + l32i abi_saved1, a1, PT_PS + beq abi_tmp0, abi_tmp1, .Lrestore_state #endif .Ltif_loop: - irq_save a2, a3 + irq_save abi_tmp0, abi_tmp1 #ifdef CONFIG_TRACE_IRQFLAGS abi_call trace_hardirqs_off #endif /* Jump if we are returning from kernel exceptions. */ - l32i abi_saved1, a1, PT_PS - GET_THREAD_INFO(a2, a1) - l32i a4, a2, TI_FLAGS - _bbci.l abi_saved1, PS_UM_BIT, .Lexit_tif_loop_kernel + l32i abi_saved1, a1, PT_PS + GET_THREAD_INFO(abi_tmp0, a1) + l32i abi_saved0, abi_tmp0, TI_FLAGS + _bbci.l abi_saved1, PS_UM_BIT, .Lexit_tif_loop_kernel /* Specific to a user exception exit: * We need to check some flags for signal handling and rescheduling, @@ -513,75 +506,80 @@ common_exception_return: * Note that we don't disable interrupts here. */ - _bbsi.l a4, TIF_NEED_RESCHED, .Lresched - movi a2, _TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NOTIFY_SIGNAL - bnone a4, a2, .Lexit_tif_loop_user + _bbsi.l abi_saved0, TIF_NEED_RESCHED, .Lresched + movi abi_tmp0, _TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NOTIFY_SIGNAL + bnone abi_saved0, abi_tmp0, .Lexit_tif_loop_user - l32i a4, a1, PT_DEPC - bgeui a4, VALID_DOUBLE_EXCEPTION_ADDRESS, .Lrestore_state + l32i abi_tmp0, a1, PT_DEPC + bgeui abi_tmp0, VALID_DOUBLE_EXCEPTION_ADDRESS, .Lrestore_state /* Call do_signal() */ #ifdef CONFIG_TRACE_IRQFLAGS abi_call trace_hardirqs_on #endif - rsil a2, 0 - mov abi_arg0, a1 + rsil abi_tmp0, 0 + mov abi_arg0, a1 abi_call do_notify_resume # int do_notify_resume(struct pt_regs*) - j .Ltif_loop + j .Ltif_loop .Lresched: #ifdef CONFIG_TRACE_IRQFLAGS abi_call trace_hardirqs_on #endif - rsil a2, 0 + rsil abi_tmp0, 0 abi_call schedule # void schedule (void) - j .Ltif_loop + j .Ltif_loop .Lexit_tif_loop_kernel: #ifdef CONFIG_PREEMPTION - _bbci.l a4, TIF_NEED_RESCHED, .Lrestore_state + _bbci.l abi_saved0, TIF_NEED_RESCHED, .Lrestore_state /* Check current_thread_info->preempt_count */ - l32i a4, a2, TI_PRE_COUNT - bnez a4, .Lrestore_state + l32i abi_tmp1, abi_tmp0, TI_PRE_COUNT + bnez abi_tmp1, .Lrestore_state abi_call preempt_schedule_irq #endif - j .Lrestore_state + j .Lrestore_state .Lexit_tif_loop_user: +#ifdef CONFIG_CONTEXT_TRACKING + abi_call context_tracking_user_enter +#endif #ifdef CONFIG_HAVE_HW_BREAKPOINT - _bbci.l a4, TIF_DB_DISABLED, 1f + _bbci.l abi_saved0, TIF_DB_DISABLED, 1f abi_call restore_dbreak 1: #endif #ifdef CONFIG_DEBUG_TLB_SANITY - l32i a4, a1, PT_DEPC - bgeui a4, VALID_DOUBLE_EXCEPTION_ADDRESS, .Lrestore_state + l32i abi_tmp0, a1, PT_DEPC + bgeui abi_tmp0, VALID_DOUBLE_EXCEPTION_ADDRESS, .Lrestore_state abi_call check_tlb_sanity #endif .Lrestore_state: #ifdef CONFIG_TRACE_IRQFLAGS - extui a4, abi_saved1, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH - bgei a4, LOCKLEVEL, 1f + extui abi_tmp0, abi_saved1, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH + bgei abi_tmp0, LOCKLEVEL, 1f abi_call trace_hardirqs_on 1: #endif - /* Restore optional registers. */ + /* + * Restore optional registers. + * abi_arg* are used as temporary registers here. + */ - load_xtregs_opt a1 a2 a4 a5 a6 a7 PT_XTREGS_OPT + load_xtregs_opt a1 abi_tmp0 abi_arg0 abi_arg1 abi_arg2 abi_arg3 PT_XTREGS_OPT /* Restore SCOMPARE1 */ #if XCHAL_HAVE_S32C1I - l32i a2, a1, PT_SCOMPARE1 - wsr a2, scompare1 + l32i abi_tmp0, a1, PT_SCOMPARE1 + wsr abi_tmp0, scompare1 #endif - wsr abi_saved1, ps /* disable interrupts */ - - _bbci.l abi_saved1, PS_UM_BIT, kernel_exception_exit + wsr abi_saved1, ps /* disable interrupts */ + _bbci.l abi_saved1, PS_UM_BIT, kernel_exception_exit user_exception_exit: @@ -795,7 +793,7 @@ ENDPROC(kernel_exception) ENTRY(debug_exception) rsr a0, SREG_EPS + XCHAL_DEBUGLEVEL - bbsi.l a0, PS_EXCM_BIT, 1f # exception mode + bbsi.l a0, PS_EXCM_BIT, .Ldebug_exception_in_exception # exception mode /* Set EPC1 and EXCCAUSE */ @@ -814,10 +812,10 @@ ENTRY(debug_exception) /* Switch to kernel/user stack, restore jump vector, and save a0 */ - bbsi.l a2, PS_UM_BIT, 2f # jump if user mode - + bbsi.l a2, PS_UM_BIT, .Ldebug_exception_user # jump if user mode addi a2, a1, -16 - PT_KERNEL_SIZE # assume kernel stack -3: + +.Ldebug_exception_continue: l32i a0, a3, DT_DEBUG_SAVE s32i a1, a2, PT_AREG1 s32i a0, a2, PT_AREG0 @@ -845,10 +843,12 @@ ENTRY(debug_exception) bbsi.l a2, PS_UM_BIT, _user_exception j _kernel_exception -2: rsr a2, excsave1 +.Ldebug_exception_user: + rsr a2, excsave1 l32i a2, a2, EXC_TABLE_KSTK # load kernel stack pointer - j 3b + j .Ldebug_exception_continue +.Ldebug_exception_in_exception: #ifdef CONFIG_HAVE_HW_BREAKPOINT /* Debug exception while in exception mode. This may happen when * window overflow/underflow handler or fast exception handler hits @@ -856,8 +856,8 @@ ENTRY(debug_exception) * breakpoints, single-step faulting instruction and restore data * breakpoints. */ -1: - bbci.l a0, PS_UM_BIT, 1b # jump if kernel mode + + bbci.l a0, PS_UM_BIT, .Ldebug_exception_in_exception # jump if kernel mode rsr a0, debugcause bbsi.l a0, DEBUGCAUSE_DBREAK_BIT, .Ldebug_save_dbreak @@ -901,7 +901,7 @@ ENTRY(debug_exception) rfi XCHAL_DEBUGLEVEL #else /* Debug exception while in exception mode. Should not happen. */ -1: j 1b // FIXME!! + j .Ldebug_exception_in_exception // FIXME!! #endif ENDPROC(debug_exception) @@ -1056,6 +1056,11 @@ ENTRY(fast_illegal_instruction_user) movi a3, PS_WOE_MASK or a0, a0, a3 wsr a0, ps +#ifdef CONFIG_USER_ABI_CALL0_PROBE + GET_THREAD_INFO(a3, a2) + rsr a0, epc1 + s32i a0, a3, TI_PS_WOE_FIX_ADDR +#endif l32i a3, a2, PT_AREG3 l32i a0, a2, PT_AREG0 rsr a2, depc @@ -1630,12 +1635,13 @@ ENTRY(fast_second_level_miss) GET_CURRENT(a1,a2) l32i a0, a1, TASK_MM # tsk->mm - beqz a0, 9f + beqz a0, .Lfast_second_level_miss_no_mm -8: rsr a3, excvaddr # fault address +.Lfast_second_level_miss_continue: + rsr a3, excvaddr # fault address _PGD_OFFSET(a0, a3, a1) l32i a0, a0, 0 # read pmdval - beqz a0, 2f + beqz a0, .Lfast_second_level_miss_no_pmd /* Read ptevaddr and convert to top of page-table page. * @@ -1678,12 +1684,13 @@ ENTRY(fast_second_level_miss) addi a3, a3, DTLB_WAY_PGD add a1, a1, a3 # ... + way_number -3: wdtlb a0, a1 +.Lfast_second_level_miss_wdtlb: + wdtlb a0, a1 dsync /* Exit critical section. */ - -4: rsr a3, excsave1 +.Lfast_second_level_miss_skip_wdtlb: + rsr a3, excsave1 movi a0, 0 s32i a0, a3, EXC_TABLE_FIXUP @@ -1707,19 +1714,21 @@ ENTRY(fast_second_level_miss) esync rfde -9: l32i a0, a1, TASK_ACTIVE_MM # unlikely case mm == 0 - bnez a0, 8b +.Lfast_second_level_miss_no_mm: + l32i a0, a1, TASK_ACTIVE_MM # unlikely case mm == 0 + bnez a0, .Lfast_second_level_miss_continue /* Even more unlikely case active_mm == 0. * We can get here with NMI in the middle of context_switch that * touches vmalloc area. */ movi a0, init_mm - j 8b + j .Lfast_second_level_miss_continue +.Lfast_second_level_miss_no_pmd: #if (DCACHE_WAY_SIZE > PAGE_SIZE) -2: /* Special case for cache aliasing. + /* Special case for cache aliasing. * We (should) only get here if a clear_user_page, copy_user_page * or the aliased cache flush functions got preemptively interrupted * by another task. Re-establish temporary mapping to the @@ -1729,24 +1738,24 @@ ENTRY(fast_second_level_miss) /* We shouldn't be in a double exception */ l32i a0, a2, PT_DEPC - bgeui a0, VALID_DOUBLE_EXCEPTION_ADDRESS, 2f + bgeui a0, VALID_DOUBLE_EXCEPTION_ADDRESS, .Lfast_second_level_miss_slow /* Make sure the exception originated in the special functions */ movi a0, __tlbtemp_mapping_start rsr a3, epc1 - bltu a3, a0, 2f + bltu a3, a0, .Lfast_second_level_miss_slow movi a0, __tlbtemp_mapping_end - bgeu a3, a0, 2f + bgeu a3, a0, .Lfast_second_level_miss_slow /* Check if excvaddr was in one of the TLBTEMP_BASE areas. */ movi a3, TLBTEMP_BASE_1 rsr a0, excvaddr - bltu a0, a3, 2f + bltu a0, a3, .Lfast_second_level_miss_slow addi a1, a0, -TLBTEMP_SIZE - bgeu a1, a3, 2f + bgeu a1, a3, .Lfast_second_level_miss_slow /* Check if we have to restore an ITLB mapping. */ @@ -1772,19 +1781,19 @@ ENTRY(fast_second_level_miss) mov a0, a6 movnez a0, a7, a3 - j 3b + j .Lfast_second_level_miss_wdtlb /* ITLB entry. We only use dst in a6. */ 1: witlb a6, a1 isync - j 4b + j .Lfast_second_level_miss_skip_wdtlb #endif // DCACHE_WAY_SIZE > PAGE_SIZE - -2: /* Invalid PGD, default exception handling */ + /* Invalid PGD, default exception handling */ +.Lfast_second_level_miss_slow: rsr a1, depc s32i a1, a2, PT_AREG2 @@ -1824,12 +1833,13 @@ ENTRY(fast_store_prohibited) GET_CURRENT(a1,a2) l32i a0, a1, TASK_MM # tsk->mm - beqz a0, 9f + beqz a0, .Lfast_store_no_mm -8: rsr a1, excvaddr # fault address +.Lfast_store_continue: + rsr a1, excvaddr # fault address _PGD_OFFSET(a0, a1, a3) l32i a0, a0, 0 - beqz a0, 2f + beqz a0, .Lfast_store_slow /* * Note that we test _PAGE_WRITABLE_BIT only if PTE is present @@ -1839,8 +1849,8 @@ ENTRY(fast_store_prohibited) _PTE_OFFSET(a0, a1, a3) l32i a3, a0, 0 # read pteval movi a1, _PAGE_CA_INVALID - ball a3, a1, 2f - bbci.l a3, _PAGE_WRITABLE_BIT, 2f + ball a3, a1, .Lfast_store_slow + bbci.l a3, _PAGE_WRITABLE_BIT, .Lfast_store_slow movi a1, _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_HW_WRITE or a3, a3, a1 @@ -1868,7 +1878,6 @@ ENTRY(fast_store_prohibited) l32i a2, a2, PT_DEPC bgeui a2, VALID_DOUBLE_EXCEPTION_ADDRESS, 1f - rsr a2, depc rfe @@ -1878,11 +1887,17 @@ ENTRY(fast_store_prohibited) esync rfde -9: l32i a0, a1, TASK_ACTIVE_MM # unlikely case mm == 0 - j 8b - -2: /* If there was a problem, handle fault in C */ +.Lfast_store_no_mm: + l32i a0, a1, TASK_ACTIVE_MM # unlikely case mm == 0 + j .Lfast_store_continue + /* If there was a problem, handle fault in C */ +.Lfast_store_slow: + rsr a1, excvaddr + pdtlb a0, a1 + bbci.l a0, DTLB_HIT_BIT, 1f + idtlb a0 +1: rsr a3, depc # still holds a2 s32i a3, a2, PT_AREG2 mov a1, a2 @@ -2071,8 +2086,16 @@ ENTRY(_switch_to) #if (XTENSA_HAVE_COPROCESSORS || XTENSA_HAVE_IO_PORTS) l32i a3, a5, THREAD_CPENABLE - xsr a3, cpenable - s32i a3, a4, THREAD_CPENABLE +#ifdef CONFIG_SMP + beqz a3, 1f + memw # pairs with memw (2) in fast_coprocessor + l32i a6, a5, THREAD_CP_OWNER_CPU + l32i a7, a5, THREAD_CPU + beq a6, a7, 1f # load 0 into CPENABLE if current CPU is not the owner + movi a3, 0 +1: +#endif + wsr a3, cpenable #endif #if XCHAL_HAVE_EXCLUSIVE @@ -2147,3 +2170,95 @@ ENTRY(ret_from_kernel_thread) j common_exception_return ENDPROC(ret_from_kernel_thread) + +#ifdef CONFIG_HIBERNATION + + .bss + .align 4 +.Lsaved_regs: +#if defined(__XTENSA_WINDOWED_ABI__) + .fill 2, 4 +#elif defined(__XTENSA_CALL0_ABI__) + .fill 6, 4 +#else +#error Unsupported Xtensa ABI +#endif + .align XCHAL_NCP_SA_ALIGN +.Lsaved_user_regs: + .fill XTREGS_USER_SIZE, 1 + + .previous + +ENTRY(swsusp_arch_suspend) + + abi_entry_default + + movi a2, .Lsaved_regs + movi a3, .Lsaved_user_regs + s32i a0, a2, 0 + s32i a1, a2, 4 + save_xtregs_user a3 a4 a5 a6 a7 a8 0 +#if defined(__XTENSA_WINDOWED_ABI__) + spill_registers_kernel +#elif defined(__XTENSA_CALL0_ABI__) + s32i a12, a2, 8 + s32i a13, a2, 12 + s32i a14, a2, 16 + s32i a15, a2, 20 +#else +#error Unsupported Xtensa ABI +#endif + abi_call swsusp_save + mov a2, abi_rv + abi_ret_default + +ENDPROC(swsusp_arch_suspend) + +ENTRY(swsusp_arch_resume) + + abi_entry_default + +#if defined(__XTENSA_WINDOWED_ABI__) + spill_registers_kernel +#endif + + movi a2, restore_pblist + l32i a2, a2, 0 + +.Lcopy_pbe: + l32i a3, a2, PBE_ADDRESS + l32i a4, a2, PBE_ORIG_ADDRESS + + __loopi a3, a9, PAGE_SIZE, 16 + l32i a5, a3, 0 + l32i a6, a3, 4 + l32i a7, a3, 8 + l32i a8, a3, 12 + addi a3, a3, 16 + s32i a5, a4, 0 + s32i a6, a4, 4 + s32i a7, a4, 8 + s32i a8, a4, 12 + addi a4, a4, 16 + __endl a3, a9 + + l32i a2, a2, PBE_NEXT + bnez a2, .Lcopy_pbe + + movi a2, .Lsaved_regs + movi a3, .Lsaved_user_regs + l32i a0, a2, 0 + l32i a1, a2, 4 + load_xtregs_user a3 a4 a5 a6 a7 a8 0 +#if defined(__XTENSA_CALL0_ABI__) + l32i a12, a2, 8 + l32i a13, a2, 12 + l32i a14, a2, 16 + l32i a15, a2, 20 +#endif + movi a2, 0 + abi_ret_default + +ENDPROC(swsusp_arch_resume) + +#endif diff --git a/arch/xtensa/kernel/hibernate.c b/arch/xtensa/kernel/hibernate.c new file mode 100644 index 000000000000..06984327d6e2 --- /dev/null +++ b/arch/xtensa/kernel/hibernate.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/mm.h> +#include <linux/suspend.h> +#include <asm/coprocessor.h> + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin)); + unsigned long nosave_end_pfn = PFN_UP(__pa(&__nosave_end)); + + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +void notrace save_processor_state(void) +{ + WARN_ON(num_online_cpus() != 1); +#if XTENSA_HAVE_COPROCESSORS + local_coprocessors_flush_release_all(); +#endif +} + +void notrace restore_processor_state(void) +{ +} diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index e8bfbca5f001..7e38292dd07a 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -47,6 +47,7 @@ #include <asm/asm-offsets.h> #include <asm/regs.h> #include <asm/hw_breakpoint.h> +#include <asm/traps.h> extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); @@ -63,52 +64,114 @@ EXPORT_SYMBOL(__stack_chk_guard); #if XTENSA_HAVE_COPROCESSORS -void coprocessor_release_all(struct thread_info *ti) +void local_coprocessors_flush_release_all(void) { - unsigned long cpenable; - int i; + struct thread_info **coprocessor_owner; + struct thread_info *unique_owner[XCHAL_CP_MAX]; + int n = 0; + int i, j; - /* Make sure we don't switch tasks during this operation. */ + coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner; + xtensa_set_sr(XCHAL_CP_MASK, cpenable); - preempt_disable(); + for (i = 0; i < XCHAL_CP_MAX; i++) { + struct thread_info *ti = coprocessor_owner[i]; - /* Walk through all cp owners and release it for the requested one. */ + if (ti) { + coprocessor_flush(ti, i); - cpenable = ti->cpenable; + for (j = 0; j < n; j++) + if (unique_owner[j] == ti) + break; + if (j == n) + unique_owner[n++] = ti; - for (i = 0; i < XCHAL_CP_MAX; i++) { - if (coprocessor_owner[i] == ti) { - coprocessor_owner[i] = 0; - cpenable &= ~(1 << i); + coprocessor_owner[i] = NULL; } } + for (i = 0; i < n; i++) { + /* pairs with memw (1) in fast_coprocessor and memw in switch_to */ + smp_wmb(); + unique_owner[i]->cpenable = 0; + } + xtensa_set_sr(0, cpenable); +} - ti->cpenable = cpenable; +static void local_coprocessor_release_all(void *info) +{ + struct thread_info *ti = info; + struct thread_info **coprocessor_owner; + int i; + + coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner; + + /* Walk through all cp owners and release it for the requested one. */ + + for (i = 0; i < XCHAL_CP_MAX; i++) { + if (coprocessor_owner[i] == ti) + coprocessor_owner[i] = NULL; + } + /* pairs with memw (1) in fast_coprocessor and memw in switch_to */ + smp_wmb(); + ti->cpenable = 0; if (ti == current_thread_info()) xtensa_set_sr(0, cpenable); +} - preempt_enable(); +void coprocessor_release_all(struct thread_info *ti) +{ + if (ti->cpenable) { + /* pairs with memw (2) in fast_coprocessor */ + smp_rmb(); + smp_call_function_single(ti->cp_owner_cpu, + local_coprocessor_release_all, + ti, true); + } } -void coprocessor_flush_all(struct thread_info *ti) +static void local_coprocessor_flush_all(void *info) { - unsigned long cpenable, old_cpenable; + struct thread_info *ti = info; + struct thread_info **coprocessor_owner; + unsigned long old_cpenable; int i; - preempt_disable(); - - old_cpenable = xtensa_get_sr(cpenable); - cpenable = ti->cpenable; - xtensa_set_sr(cpenable, cpenable); + coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner; + old_cpenable = xtensa_xsr(ti->cpenable, cpenable); for (i = 0; i < XCHAL_CP_MAX; i++) { - if ((cpenable & 1) != 0 && coprocessor_owner[i] == ti) + if (coprocessor_owner[i] == ti) coprocessor_flush(ti, i); - cpenable >>= 1; } xtensa_set_sr(old_cpenable, cpenable); +} - preempt_enable(); +void coprocessor_flush_all(struct thread_info *ti) +{ + if (ti->cpenable) { + /* pairs with memw (2) in fast_coprocessor */ + smp_rmb(); + smp_call_function_single(ti->cp_owner_cpu, + local_coprocessor_flush_all, + ti, true); + } +} + +static void local_coprocessor_flush_release_all(void *info) +{ + local_coprocessor_flush_all(info); + local_coprocessor_release_all(info); +} + +void coprocessor_flush_release_all(struct thread_info *ti) +{ + if (ti->cpenable) { + /* pairs with memw (2) in fast_coprocessor */ + smp_rmb(); + smp_call_function_single(ti->cp_owner_cpu, + local_coprocessor_flush_release_all, + ti, true); + } } #endif @@ -140,8 +203,7 @@ void flush_thread(void) { #if XTENSA_HAVE_COPROCESSORS struct thread_info *ti = current_thread_info(); - coprocessor_flush_all(ti); - coprocessor_release_all(ti); + coprocessor_flush_release_all(ti); #endif flush_ptrace_hw_breakpoint(current); } diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index 323c678a691f..22cdaa6729d3 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -171,8 +171,7 @@ static int tie_set(struct task_struct *target, #if XTENSA_HAVE_COPROCESSORS /* Flush all coprocessors before we overwrite them. */ - coprocessor_flush_all(ti); - coprocessor_release_all(ti); + coprocessor_flush_release_all(ti); ti->xtregs_cp.cp0 = newregs->cp0; ti->xtregs_cp.cp1 = newregs->cp1; ti->xtregs_cp.cp2 = newregs->cp2; diff --git a/arch/xtensa/kernel/s32c1i_selftest.c b/arch/xtensa/kernel/s32c1i_selftest.c index 07e56e3a9a8b..8362388c8719 100644 --- a/arch/xtensa/kernel/s32c1i_selftest.c +++ b/arch/xtensa/kernel/s32c1i_selftest.c @@ -40,14 +40,13 @@ static inline int probed_compare_swap(int *v, int cmp, int set) /* Handle probed exception */ -static void __init do_probed_exception(struct pt_regs *regs, - unsigned long exccause) +static void __init do_probed_exception(struct pt_regs *regs) { if (regs->pc == rcw_probe_pc) { /* exception on s32c1i ? */ regs->pc += 3; /* skip the s32c1i instruction */ - rcw_exc = exccause; + rcw_exc = regs->exccause; } else { - do_unhandled(regs, exccause); + do_unhandled(regs); } } diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 6f68649e86ba..c9ffd42db873 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -162,8 +162,7 @@ setup_sigcontext(struct rt_sigframe __user *frame, struct pt_regs *regs) return err; #if XTENSA_HAVE_COPROCESSORS - coprocessor_flush_all(ti); - coprocessor_release_all(ti); + coprocessor_flush_release_all(ti); err |= __copy_to_user(&frame->xtregs.cp, &ti->xtregs_cp, sizeof (frame->xtregs.cp)); #endif diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index 1254da07ead1..4dc109dd6214 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -30,6 +30,7 @@ #include <linux/thread_info.h> #include <asm/cacheflush.h> +#include <asm/coprocessor.h> #include <asm/kdebug.h> #include <asm/mmu_context.h> #include <asm/mxregs.h> @@ -272,6 +273,12 @@ int __cpu_disable(void) */ set_cpu_online(cpu, false); +#if XTENSA_HAVE_COPROCESSORS + /* + * Flush coprocessor contexts that are active on the current CPU. + */ + local_coprocessors_flush_release_all(); +#endif /* * OK - migrate IRQs away from this CPU */ diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 9345007d474d..0c25e035ff10 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -48,25 +48,20 @@ * Machine specific interrupt handlers */ -extern void kernel_exception(void); -extern void user_exception(void); - -extern void fast_illegal_instruction_user(void); -extern void fast_syscall_user(void); -extern void fast_alloca(void); -extern void fast_unaligned(void); -extern void fast_second_level_miss(void); -extern void fast_store_prohibited(void); -extern void fast_coprocessor(void); - -extern void do_illegal_instruction (struct pt_regs*); -extern void do_interrupt (struct pt_regs*); -extern void do_nmi(struct pt_regs *); -extern void do_unaligned_user (struct pt_regs*); -extern void do_multihit (struct pt_regs*, unsigned long); -extern void do_page_fault (struct pt_regs*, unsigned long); -extern void do_debug (struct pt_regs*); -extern void system_call (struct pt_regs*); +static void do_illegal_instruction(struct pt_regs *regs); +static void do_div0(struct pt_regs *regs); +static void do_interrupt(struct pt_regs *regs); +#if XTENSA_FAKE_NMI +static void do_nmi(struct pt_regs *regs); +#endif +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION +static void do_unaligned_user(struct pt_regs *regs); +#endif +static void do_multihit(struct pt_regs *regs); +#if XTENSA_HAVE_COPROCESSORS +static void do_coprocessor(struct pt_regs *regs); +#endif +static void do_debug(struct pt_regs *regs); /* * The vector table must be preceded by a save area (which @@ -78,7 +73,8 @@ extern void system_call (struct pt_regs*); #define USER 0x02 #define COPROCESSOR(x) \ -{ EXCCAUSE_COPROCESSOR ## x ## _DISABLED, USER, fast_coprocessor } +{ EXCCAUSE_COPROCESSOR ## x ## _DISABLED, USER|KRNL, fast_coprocessor },\ +{ EXCCAUSE_COPROCESSOR ## x ## _DISABLED, 0, do_coprocessor } typedef struct { int cause; @@ -100,7 +96,7 @@ static dispatch_init_table_t __initdata dispatch_init_table[] = { #ifdef SUPPORT_WINDOWED { EXCCAUSE_ALLOCA, USER|KRNL, fast_alloca }, #endif -/* EXCCAUSE_INTEGER_DIVIDE_BY_ZERO unhandled */ +{ EXCCAUSE_INTEGER_DIVIDE_BY_ZERO, 0, do_div0 }, /* EXCCAUSE_PRIVILEGED unhandled */ #if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION #ifdef CONFIG_XTENSA_UNALIGNED_USER @@ -110,21 +106,21 @@ static dispatch_init_table_t __initdata dispatch_init_table[] = { { EXCCAUSE_UNALIGNED, KRNL, fast_unaligned }, #endif #ifdef CONFIG_MMU -{ EXCCAUSE_ITLB_MISS, 0, do_page_fault }, -{ EXCCAUSE_ITLB_MISS, USER|KRNL, fast_second_level_miss}, +{ EXCCAUSE_ITLB_MISS, 0, do_page_fault }, +{ EXCCAUSE_ITLB_MISS, USER|KRNL, fast_second_level_miss}, +{ EXCCAUSE_DTLB_MISS, USER|KRNL, fast_second_level_miss}, +{ EXCCAUSE_DTLB_MISS, 0, do_page_fault }, +{ EXCCAUSE_STORE_CACHE_ATTRIBUTE, USER|KRNL, fast_store_prohibited }, +#endif /* CONFIG_MMU */ +#ifdef CONFIG_PFAULT { EXCCAUSE_ITLB_MULTIHIT, 0, do_multihit }, -{ EXCCAUSE_ITLB_PRIVILEGE, 0, do_page_fault }, -/* EXCCAUSE_SIZE_RESTRICTION unhandled */ +{ EXCCAUSE_ITLB_PRIVILEGE, 0, do_page_fault }, { EXCCAUSE_FETCH_CACHE_ATTRIBUTE, 0, do_page_fault }, -{ EXCCAUSE_DTLB_MISS, USER|KRNL, fast_second_level_miss}, -{ EXCCAUSE_DTLB_MISS, 0, do_page_fault }, { EXCCAUSE_DTLB_MULTIHIT, 0, do_multihit }, -{ EXCCAUSE_DTLB_PRIVILEGE, 0, do_page_fault }, -/* EXCCAUSE_DTLB_SIZE_RESTRICTION unhandled */ -{ EXCCAUSE_STORE_CACHE_ATTRIBUTE, USER|KRNL, fast_store_prohibited }, +{ EXCCAUSE_DTLB_PRIVILEGE, 0, do_page_fault }, { EXCCAUSE_STORE_CACHE_ATTRIBUTE, 0, do_page_fault }, { EXCCAUSE_LOAD_CACHE_ATTRIBUTE, 0, do_page_fault }, -#endif /* CONFIG_MMU */ +#endif /* XCCHAL_EXCCAUSE_FLOATING_POINT unhandled */ #if XTENSA_HAVE_COPROCESSOR(0) COPROCESSOR(0), @@ -179,7 +175,7 @@ __die_if_kernel(const char *str, struct pt_regs *regs, long err) * Unhandled Exceptions. Kill user task or panic if in kernel space. */ -void do_unhandled(struct pt_regs *regs, unsigned long exccause) +void do_unhandled(struct pt_regs *regs) { __die_if_kernel("Caught unhandled exception - should not happen", regs, SIGKILL); @@ -189,7 +185,7 @@ void do_unhandled(struct pt_regs *regs, unsigned long exccause) "(pid = %d, pc = %#010lx) - should not happen\n" "\tEXCCAUSE is %ld\n", current->comm, task_pid_nr(current), regs->pc, - exccause); + regs->exccause); force_sig(SIGILL); } @@ -197,7 +193,7 @@ void do_unhandled(struct pt_regs *regs, unsigned long exccause) * Multi-hit exception. This if fatal! */ -void do_multihit(struct pt_regs *regs, unsigned long exccause) +static void do_multihit(struct pt_regs *regs) { die("Caught multihit exception", regs, SIGKILL); } @@ -206,8 +202,6 @@ void do_multihit(struct pt_regs *regs, unsigned long exccause) * IRQ handler. */ -extern void do_IRQ(int, struct pt_regs *); - #if XTENSA_FAKE_NMI #define IS_POW2(v) (((v) & ((v) - 1)) == 0) @@ -240,14 +234,10 @@ irqreturn_t xtensa_pmu_irq_handler(int irq, void *dev_id); DEFINE_PER_CPU(unsigned long, nmi_count); -void do_nmi(struct pt_regs *regs) +static void do_nmi(struct pt_regs *regs) { - struct pt_regs *old_regs; - - if ((regs->ps & PS_INTLEVEL_MASK) < LOCKLEVEL) - trace_hardirqs_off(); + struct pt_regs *old_regs = set_irq_regs(regs); - old_regs = set_irq_regs(regs); nmi_enter(); ++*this_cpu_ptr(&nmi_count); check_valid_nmi(); @@ -257,7 +247,7 @@ void do_nmi(struct pt_regs *regs) } #endif -void do_interrupt(struct pt_regs *regs) +static void do_interrupt(struct pt_regs *regs) { static const unsigned int_level_mask[] = { 0, @@ -269,12 +259,9 @@ void do_interrupt(struct pt_regs *regs) XCHAL_INTLEVEL6_MASK, XCHAL_INTLEVEL7_MASK, }; - struct pt_regs *old_regs; + struct pt_regs *old_regs = set_irq_regs(regs); unsigned unhandled = ~0u; - trace_hardirqs_off(); - - old_regs = set_irq_regs(regs); irq_enter(); for (;;) { @@ -306,13 +293,47 @@ void do_interrupt(struct pt_regs *regs) set_irq_regs(old_regs); } +static bool check_div0(struct pt_regs *regs) +{ + static const u8 pattern[] = {'D', 'I', 'V', '0'}; + const u8 *p; + u8 buf[5]; + + if (user_mode(regs)) { + if (copy_from_user(buf, (void __user *)regs->pc + 2, 5)) + return false; + p = buf; + } else { + p = (const u8 *)regs->pc + 2; + } + + return memcmp(p, pattern, sizeof(pattern)) == 0 || + memcmp(p + 1, pattern, sizeof(pattern)) == 0; +} + /* * Illegal instruction. Fatal if in kernel space. */ -void -do_illegal_instruction(struct pt_regs *regs) +static void do_illegal_instruction(struct pt_regs *regs) { +#ifdef CONFIG_USER_ABI_CALL0_PROBE + /* + * When call0 application encounters an illegal instruction fast + * exception handler will attempt to set PS.WOE and retry failing + * instruction. + * If we get here we know that that instruction is also illegal + * with PS.WOE set, so it's not related to the windowed option + * hence PS.WOE may be cleared. + */ + if (regs->pc == current_thread_info()->ps_woe_fix_addr) + regs->ps &= ~PS_WOE_MASK; +#endif + if (check_div0(regs)) { + do_div0(regs); + return; + } + __die_if_kernel("Illegal instruction in kernel", regs, SIGKILL); /* If in user mode, send SIGILL signal to current process. */ @@ -322,6 +343,11 @@ do_illegal_instruction(struct pt_regs *regs) force_sig(SIGILL); } +static void do_div0(struct pt_regs *regs) +{ + __die_if_kernel("Unhandled division by 0 in kernel", regs, SIGKILL); + force_sig_fault(SIGFPE, FPE_INTDIV, (void __user *)regs->pc); +} /* * Handle unaligned memory accesses from user space. Kill task. @@ -331,8 +357,7 @@ do_illegal_instruction(struct pt_regs *regs) */ #if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION -void -do_unaligned_user (struct pt_regs *regs) +static void do_unaligned_user(struct pt_regs *regs) { __die_if_kernel("Unhandled unaligned exception in kernel", regs, SIGKILL); @@ -347,14 +372,20 @@ do_unaligned_user (struct pt_regs *regs) } #endif +#if XTENSA_HAVE_COPROCESSORS +static void do_coprocessor(struct pt_regs *regs) +{ + coprocessor_flush_release_all(current_thread_info()); +} +#endif + /* Handle debug events. * When CONFIG_HAVE_HW_BREAKPOINT is on this handler is called with * preemption disabled to avoid rescheduling and keep mapping of hardware * breakpoint structures to debug registers intact, so that * DEBUGCAUSE.DBNUM could be used in case of data breakpoint hit. */ -void -do_debug(struct pt_regs *regs) +static void do_debug(struct pt_regs *regs) { #ifdef CONFIG_HAVE_HW_BREAKPOINT int ret = check_hw_breakpoint(regs); @@ -381,7 +412,8 @@ do_debug(struct pt_regs *regs) /* Set exception C handler - for temporary use when probing exceptions */ -void * __init trap_set_handler(int cause, void *handler) +xtensa_exception_handler * +__init trap_set_handler(int cause, xtensa_exception_handler *handler) { void *previous = per_cpu(exc_table, 0).default_handler[cause]; @@ -392,8 +424,7 @@ void * __init trap_set_handler(int cause, void *handler) static void trap_init_excsave(void) { - unsigned long excsave1 = (unsigned long)this_cpu_ptr(&exc_table); - __asm__ __volatile__("wsr %0, excsave1\n" : : "a" (excsave1)); + xtensa_set_sr(this_cpu_ptr(&exc_table), excsave1); } static void trap_init_debug(void) diff --git a/arch/xtensa/lib/Makefile b/arch/xtensa/lib/Makefile index 5848c133f7ea..d4e9c397e3fd 100644 --- a/arch/xtensa/lib/Makefile +++ b/arch/xtensa/lib/Makefile @@ -8,3 +8,5 @@ lib-y += memcopy.o memset.o checksum.o \ divsi3.o udivsi3.o modsi3.o umodsi3.o mulsi3.o \ usercopy.o strncpy_user.o strnlen_user.o lib-$(CONFIG_PCI) += pci-auto.o +lib-$(CONFIG_KCSAN) += kcsan-stubs.o +KCSAN_SANITIZE_kcsan-stubs.o := n diff --git a/arch/xtensa/lib/kcsan-stubs.c b/arch/xtensa/lib/kcsan-stubs.c new file mode 100644 index 000000000000..2b08faa62b86 --- /dev/null +++ b/arch/xtensa/lib/kcsan-stubs.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bug.h> +#include <linux/types.h> + +void __atomic_store_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_load_8(const volatile void *p, int i) +{ + BUG(); +} + +u64 __atomic_exchange_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +bool __atomic_compare_exchange_8(volatile void *p1, void *p2, u64 v, bool b, int i1, int i2) +{ + BUG(); +} + +u64 __atomic_fetch_add_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_sub_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_and_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_or_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_xor_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_nand_8(volatile void *p, u64 v, int i) +{ + BUG(); +} diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S index 582d817979ed..b20d206bcb71 100644 --- a/arch/xtensa/lib/memcopy.S +++ b/arch/xtensa/lib/memcopy.S @@ -402,13 +402,13 @@ WEAK(memmove) */ # copy 16 bytes per iteration for word-aligned dst and word-aligned src #if XCHAL_HAVE_LOOPS - loopnez a7, .backLoop1done + loopnez a7, .LbackLoop1done #else /* !XCHAL_HAVE_LOOPS */ - beqz a7, .backLoop1done + beqz a7, .LbackLoop1done slli a8, a7, 4 sub a8, a3, a8 # a8 = start of first 16B source chunk #endif /* !XCHAL_HAVE_LOOPS */ -.backLoop1: +.LbackLoop1: addi a3, a3, -16 l32i a7, a3, 12 l32i a6, a3, 8 @@ -420,9 +420,9 @@ WEAK(memmove) s32i a7, a5, 4 s32i a6, a5, 0 #if !XCHAL_HAVE_LOOPS - bne a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start + bne a3, a8, .LbackLoop1 # continue loop if a3:src != a8:src_start #endif /* !XCHAL_HAVE_LOOPS */ -.backLoop1done: +.LbackLoop1done: bbci.l a4, 3, .Lback2 # copy 8 bytes addi a3, a3, -8 @@ -479,13 +479,13 @@ WEAK(memmove) #endif l32i a6, a3, 0 # load first word #if XCHAL_HAVE_LOOPS - loopnez a7, .backLoop2done + loopnez a7, .LbackLoop2done #else /* !XCHAL_HAVE_LOOPS */ - beqz a7, .backLoop2done + beqz a7, .LbackLoop2done slli a10, a7, 4 sub a10, a3, a10 # a10 = start of first 16B source chunk #endif /* !XCHAL_HAVE_LOOPS */ -.backLoop2: +.LbackLoop2: addi a3, a3, -16 l32i a7, a3, 12 l32i a8, a3, 8 @@ -501,9 +501,9 @@ WEAK(memmove) __src_b a9, a6, a9 s32i a9, a5, 0 #if !XCHAL_HAVE_LOOPS - bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start + bne a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start #endif /* !XCHAL_HAVE_LOOPS */ -.backLoop2done: +.LbackLoop2done: bbci.l a4, 3, .Lback12 # copy 8 bytes addi a3, a3, -8 diff --git a/arch/xtensa/mm/Makefile b/arch/xtensa/mm/Makefile index f7fb08ae768f..44153a335951 100644 --- a/arch/xtensa/mm/Makefile +++ b/arch/xtensa/mm/Makefile @@ -4,7 +4,8 @@ # obj-y := init.o misc.o -obj-$(CONFIG_MMU) += cache.o fault.o ioremap.o mmu.o tlb.o +obj-$(CONFIG_PFAULT) += fault.o +obj-$(CONFIG_MMU) += cache.o ioremap.o mmu.o tlb.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 06d0973a0d74..16f0a5ff5799 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -21,9 +21,61 @@ #include <asm/cacheflush.h> #include <asm/hardirq.h> -DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST; void bad_page_fault(struct pt_regs*, unsigned long, int); +static void vmalloc_fault(struct pt_regs *regs, unsigned int address) +{ +#ifdef CONFIG_MMU + /* Synchronize this task's top level page-table + * with the 'reference' page table. + */ + struct mm_struct *act_mm = current->active_mm; + int index = pgd_index(address); + pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + pte_t *pte_k; + + if (act_mm == NULL) + goto bad_page_fault; + + pgd = act_mm->pgd + index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + goto bad_page_fault; + + pgd_val(*pgd) = pgd_val(*pgd_k); + + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d) || !p4d_present(*p4d_k)) + goto bad_page_fault; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); + if (!pud_present(*pud) || !pud_present(*pud_k)) + goto bad_page_fault; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd) || !pmd_present(*pmd_k)) + goto bad_page_fault; + + pmd_val(*pmd) = pmd_val(*pmd_k); + pte_k = pte_offset_kernel(pmd_k, address); + + if (!pte_present(*pte_k)) + goto bad_page_fault; + return; + +bad_page_fault: + bad_page_fault(regs, address, SIGKILL); +#else + WARN_ONCE(1, "%s in noMMU configuration\n", __func__); +#endif +} /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -49,8 +101,10 @@ void do_page_fault(struct pt_regs *regs) /* We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. */ - if (address >= TASK_SIZE && !user_mode(regs)) - goto vmalloc_fault; + if (address >= TASK_SIZE && !user_mode(regs)) { + vmalloc_fault(regs, address); + return; + } /* If we're in an interrupt or have no user * context, we must not take the fault.. @@ -114,7 +168,7 @@ good_area: if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) - goto bad_page_fault; + bad_page_fault(regs, address, SIGKILL); return; } @@ -181,56 +235,6 @@ do_sigbus: if (!user_mode(regs)) bad_page_fault(regs, address, SIGBUS); return; - -vmalloc_fault: - { - /* Synchronize this task's top level page-table - * with the 'reference' page table. - */ - struct mm_struct *act_mm = current->active_mm; - int index = pgd_index(address); - pgd_t *pgd, *pgd_k; - p4d_t *p4d, *p4d_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - pte_t *pte_k; - - if (act_mm == NULL) - goto bad_page_fault; - - pgd = act_mm->pgd + index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - goto bad_page_fault; - - pgd_val(*pgd) = pgd_val(*pgd_k); - - p4d = p4d_offset(pgd, address); - p4d_k = p4d_offset(pgd_k, address); - if (!p4d_present(*p4d) || !p4d_present(*p4d_k)) - goto bad_page_fault; - - pud = pud_offset(p4d, address); - pud_k = pud_offset(p4d_k, address); - if (!pud_present(*pud) || !pud_present(*pud_k)) - goto bad_page_fault; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd) || !pmd_present(*pmd_k)) - goto bad_page_fault; - - pmd_val(*pmd) = pmd_val(*pmd_k); - pte_k = pte_offset_kernel(pmd_k, address); - - if (!pte_present(*pte_k)) - goto bad_page_fault; - return; - } -bad_page_fault: - bad_page_fault(regs, address, SIGKILL); - return; } diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index 38acda4f04e8..92e158c69c10 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -18,6 +18,8 @@ #include <asm/initialize_mmu.h> #include <asm/io.h> +DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST; + #if defined(CONFIG_HIGHMEM) static void * __init init_pmd(unsigned long vaddr, unsigned long n_pages) { diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index be3aaaad8bee..fd84d4891758 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -38,9 +38,6 @@ #define ISS_NET_TIMER_VALUE (HZ / 10) -static DEFINE_SPINLOCK(opened_lock); -static LIST_HEAD(opened); - static DEFINE_SPINLOCK(devices_lock); static LIST_HEAD(devices); @@ -59,17 +56,27 @@ struct tuntap_info { /* ------------------------------------------------------------------------- */ +struct iss_net_private; + +struct iss_net_ops { + int (*open)(struct iss_net_private *lp); + void (*close)(struct iss_net_private *lp); + int (*read)(struct iss_net_private *lp, struct sk_buff **skb); + int (*write)(struct iss_net_private *lp, struct sk_buff **skb); + unsigned short (*protocol)(struct sk_buff *skb); + int (*poll)(struct iss_net_private *lp); +}; + /* This structure contains out private information for the driver. */ struct iss_net_private { struct list_head device_list; - struct list_head opened_list; spinlock_t lock; struct net_device *dev; struct platform_device pdev; struct timer_list tl; - struct net_device_stats stats; + struct rtnl_link_stats64 stats; struct timer_list timer; unsigned int timer_val; @@ -82,12 +89,7 @@ struct iss_net_private { struct tuntap_info tuntap; } info; - int (*open)(struct iss_net_private *lp); - void (*close)(struct iss_net_private *lp); - int (*read)(struct iss_net_private *lp, struct sk_buff **skb); - int (*write)(struct iss_net_private *lp, struct sk_buff **skb); - unsigned short (*protocol)(struct sk_buff *skb); - int (*poll)(struct iss_net_private *lp); + const struct iss_net_ops *net_ops; } tp; }; @@ -215,6 +217,15 @@ static int tuntap_poll(struct iss_net_private *lp) return simc_poll(lp->tp.info.tuntap.fd); } +static const struct iss_net_ops tuntap_ops = { + .open = tuntap_open, + .close = tuntap_close, + .read = tuntap_read, + .write = tuntap_write, + .protocol = tuntap_protocol, + .poll = tuntap_poll, +}; + /* * ethX=tuntap,[mac address],device name */ @@ -257,13 +268,7 @@ static int tuntap_probe(struct iss_net_private *lp, int index, char *init) lp->mtu = TRANSPORT_TUNTAP_MTU; lp->tp.info.tuntap.fd = -1; - - lp->tp.open = tuntap_open; - lp->tp.close = tuntap_close; - lp->tp.read = tuntap_read; - lp->tp.write = tuntap_write; - lp->tp.protocol = tuntap_protocol; - lp->tp.poll = tuntap_poll; + lp->tp.net_ops = &tuntap_ops; return 1; } @@ -278,14 +283,16 @@ static int iss_net_rx(struct net_device *dev) /* Check if there is any new data. */ - if (lp->tp.poll(lp) == 0) + if (lp->tp.net_ops->poll(lp) == 0) return 0; /* Try to allocate memory, if it fails, try again next round. */ skb = dev_alloc_skb(dev->mtu + 2 + ETH_HEADER_OTHER); if (skb == NULL) { + spin_lock_bh(&lp->lock); lp->stats.rx_dropped++; + spin_unlock_bh(&lp->lock); return 0; } @@ -295,15 +302,17 @@ static int iss_net_rx(struct net_device *dev) skb->dev = dev; skb_reset_mac_header(skb); - pkt_len = lp->tp.read(lp, &skb); + pkt_len = lp->tp.net_ops->read(lp, &skb); skb_put(skb, pkt_len); if (pkt_len > 0) { skb_trim(skb, pkt_len); - skb->protocol = lp->tp.protocol(skb); + skb->protocol = lp->tp.net_ops->protocol(skb); + spin_lock_bh(&lp->lock); lp->stats.rx_bytes += skb->len; lp->stats.rx_packets++; + spin_unlock_bh(&lp->lock); netif_rx(skb); return pkt_len; } @@ -311,38 +320,24 @@ static int iss_net_rx(struct net_device *dev) return pkt_len; } -static int iss_net_poll(void) +static int iss_net_poll(struct iss_net_private *lp) { - struct list_head *ele; int err, ret = 0; - spin_lock(&opened_lock); - - list_for_each(ele, &opened) { - struct iss_net_private *lp; - - lp = list_entry(ele, struct iss_net_private, opened_list); - - if (!netif_running(lp->dev)) - break; - - spin_lock(&lp->lock); - - while ((err = iss_net_rx(lp->dev)) > 0) - ret++; + if (!netif_running(lp->dev)) + return 0; - spin_unlock(&lp->lock); + while ((err = iss_net_rx(lp->dev)) > 0) + ret++; - if (err < 0) { - pr_err("Device '%s' read returned %d, shutting it down\n", - lp->dev->name, err); - dev_close(lp->dev); - } else { - /* FIXME reactivate_fd(lp->fd, ISS_ETH_IRQ); */ - } + if (err < 0) { + pr_err("Device '%s' read returned %d, shutting it down\n", + lp->dev->name, err); + dev_close(lp->dev); + } else { + /* FIXME reactivate_fd(lp->fd, ISS_ETH_IRQ); */ } - spin_unlock(&opened_lock); return ret; } @@ -351,10 +346,8 @@ static void iss_net_timer(struct timer_list *t) { struct iss_net_private *lp = from_timer(lp, t, timer); - iss_net_poll(); - spin_lock(&lp->lock); + iss_net_poll(lp); mod_timer(&lp->timer, jiffies + lp->timer_val); - spin_unlock(&lp->lock); } @@ -363,11 +356,9 @@ static int iss_net_open(struct net_device *dev) struct iss_net_private *lp = netdev_priv(dev); int err; - spin_lock_bh(&lp->lock); - - err = lp->tp.open(lp); + err = lp->tp.net_ops->open(lp); if (err < 0) - goto out; + return err; netif_start_queue(dev); @@ -378,36 +369,21 @@ static int iss_net_open(struct net_device *dev) while ((err = iss_net_rx(dev)) > 0) ; - spin_unlock_bh(&lp->lock); - spin_lock_bh(&opened_lock); - list_add(&lp->opened_list, &opened); - spin_unlock_bh(&opened_lock); - spin_lock_bh(&lp->lock); - timer_setup(&lp->timer, iss_net_timer, 0); lp->timer_val = ISS_NET_TIMER_VALUE; mod_timer(&lp->timer, jiffies + lp->timer_val); -out: - spin_unlock_bh(&lp->lock); return err; } static int iss_net_close(struct net_device *dev) { struct iss_net_private *lp = netdev_priv(dev); - netif_stop_queue(dev); - spin_lock_bh(&lp->lock); - - spin_lock(&opened_lock); - list_del(&opened); - spin_unlock(&opened_lock); + netif_stop_queue(dev); del_timer_sync(&lp->timer); + lp->tp.net_ops->close(lp); - lp->tp.close(lp); - - spin_unlock_bh(&lp->lock); return 0; } @@ -417,13 +393,14 @@ static int iss_net_start_xmit(struct sk_buff *skb, struct net_device *dev) int len; netif_stop_queue(dev); - spin_lock_bh(&lp->lock); - len = lp->tp.write(lp, &skb); + len = lp->tp.net_ops->write(lp, &skb); if (len == skb->len) { + spin_lock_bh(&lp->lock); lp->stats.tx_packets++; lp->stats.tx_bytes += skb->len; + spin_unlock_bh(&lp->lock); netif_trans_update(dev); netif_start_queue(dev); @@ -432,24 +409,29 @@ static int iss_net_start_xmit(struct sk_buff *skb, struct net_device *dev) } else if (len == 0) { netif_start_queue(dev); + spin_lock_bh(&lp->lock); lp->stats.tx_dropped++; + spin_unlock_bh(&lp->lock); } else { netif_start_queue(dev); pr_err("%s: %s failed(%d)\n", dev->name, __func__, len); } - spin_unlock_bh(&lp->lock); dev_kfree_skb(skb); return NETDEV_TX_OK; } -static struct net_device_stats *iss_net_get_stats(struct net_device *dev) +static void iss_net_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct iss_net_private *lp = netdev_priv(dev); - return &lp->stats; + + spin_lock_bh(&lp->lock); + *stats = lp->stats; + spin_unlock_bh(&lp->lock); } static void iss_net_set_multicast_list(struct net_device *dev) @@ -460,19 +442,6 @@ static void iss_net_tx_timeout(struct net_device *dev, unsigned int txqueue) { } -static int iss_net_set_mac(struct net_device *dev, void *addr) -{ - struct iss_net_private *lp = netdev_priv(dev); - struct sockaddr *hwaddr = addr; - - if (!is_valid_ether_addr(hwaddr->sa_data)) - return -EADDRNOTAVAIL; - spin_lock_bh(&lp->lock); - eth_hw_addr_set(dev, hwaddr->sa_data); - spin_unlock_bh(&lp->lock); - return 0; -} - static int iss_net_change_mtu(struct net_device *dev, int new_mtu) { return -EINVAL; @@ -494,11 +463,11 @@ static int driver_registered; static const struct net_device_ops iss_netdev_ops = { .ndo_open = iss_net_open, .ndo_stop = iss_net_close, - .ndo_get_stats = iss_net_get_stats, + .ndo_get_stats64 = iss_net_get_stats64, .ndo_start_xmit = iss_net_start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = iss_net_change_mtu, - .ndo_set_mac_address = iss_net_set_mac, + .ndo_set_mac_address = eth_mac_addr, .ndo_tx_timeout = iss_net_tx_timeout, .ndo_set_rx_mode = iss_net_set_multicast_list, }; @@ -520,7 +489,6 @@ static int iss_net_configure(int index, char *init) lp = netdev_priv(dev); *lp = (struct iss_net_private) { .device_list = LIST_HEAD_INIT(lp->device_list), - .opened_list = LIST_HEAD_INIT(lp->opened_list), .dev = dev, .index = index, }; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 0f0e0724397f..4255b92fa3eb 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -211,12 +211,18 @@ static ssize_t proc_read_simdisk(struct file *file, char __user *buf, struct simdisk *dev = pde_data(file_inode(file)); const char *s = dev->filename; if (s) { - ssize_t n = simple_read_from_buffer(buf, size, ppos, - s, strlen(s)); - if (n < 0) - return n; - buf += n; - size -= n; + ssize_t len = strlen(s); + char *temp = kmalloc(len + 2, GFP_KERNEL); + + if (!temp) + return -ENOMEM; + + len = scnprintf(temp, len + 2, "%s\n", s); + len = simple_read_from_buffer(buf, size, ppos, + temp, len); + + kfree(temp); + return len; } return simple_read_from_buffer(buf, size, ppos, "\n", 1); } diff --git a/arch/xtensa/platforms/xt2000/setup.c b/arch/xtensa/platforms/xt2000/setup.c index 145d129be76f..0dc22c371614 100644 --- a/arch/xtensa/platforms/xt2000/setup.c +++ b/arch/xtensa/platforms/xt2000/setup.c @@ -78,7 +78,7 @@ void __init platform_init(bp_tag_t *first) void platform_heartbeat(void) { - static int i=0, t = 0; + static int i, t; if (--t < 0) { |