Diffstat (limited to 'arch/arm64')
91 files changed, 3299 insertions, 783 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2f8ff354ca6..ed15b876fa74 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -34,6 +34,7 @@ config ARM64 select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP @@ -423,7 +424,7 @@ config AMPERE_ERRATUM_AC03_CPU_38 default y help This option adds an alternative code sequence to work around Ampere - erratum AC03_CPU_38 on AmpereOne. + errata AC03_CPU_38 and AC04_CPU_10 on AmpereOne. The affected design reports FEAT_HAFDBS as not implemented in ID_AA64MMFR1_EL1.HAFDBS, but (V)TCR_ELx.{HA,HD} are not RES0 @@ -2137,6 +2138,29 @@ config ARM64_EPAN if the cpu does not implement the feature. endmenu # "ARMv8.7 architectural features" +menu "ARMv8.9 architectural features" + +config ARM64_POE + prompt "Permission Overlay Extension" + def_bool y + select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_PKEYS + help + The Permission Overlay Extension is used to implement Memory + Protection Keys. Memory Protection Keys provides a mechanism for + enforcing page-based protections, but without requiring modification + of the page tables when an application changes protection domains. + + For details, see Documentation/core-api/protection-keys.rst + + If unsure, say y. + +config ARCH_PKEY_BITS + int + default 3 + +endmenu # "ARMv8.9 architectural features" + config ARM64_SVE bool "ARM Scalable Vector Extension support" default y diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts index a608a219543e..3e08e2fd0a78 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts @@ -387,7 +387,7 @@ pmic { pmic_int_l: pmic-int-l { - rockchip,pins = <2 RK_PA6 RK_FUNC_GPIO &pcfg_pull_up>; + rockchip,pins = <0 RK_PA2 RK_FUNC_GPIO &pcfg_pull_up>; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi index ccbe3a7a1d2c..d24444cdf54a 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi @@ -154,6 +154,22 @@ }; }; +&gpio3 { + /* + * The Qseven BIOS_DISABLE signal on the RK3399-Q7 keeps the on-module + * eMMC and SPI flash powered-down initially (in fact it keeps the + * reset signal asserted). BIOS_DISABLE_OVERRIDE pin allows to override + * that signal so that eMMC and SPI can be used regardless of the state + * of the signal. + */ + bios-disable-override-hog { + gpios = <RK_PD5 GPIO_ACTIVE_LOW>; + gpio-hog; + line-name = "bios_disable_override"; + output-high; + }; +}; + &gmac { assigned-clocks = <&cru SCLK_RMII_SRC>; assigned-clock-parents = <&clkin_gmac>; @@ -409,6 +425,7 @@ &i2s0 { pinctrl-0 = <&i2s0_2ch_bus>; + pinctrl-1 = <&i2s0_2ch_bus_bclk_off>; rockchip,playback-channels = <2>; rockchip,capture-channels = <2>; status = "okay"; @@ -417,8 +434,8 @@ /* * As Q7 does not specify neither a global nor a RX clock for I2S these * signals are not used. Furthermore I2S0_LRCK_RX is used as GPIO. - * Therefore we have to redefine the i2s0_2ch_bus definition to prevent - * conflicts. + * Therefore we have to redefine the i2s0_2ch_bus and i2s0_2ch_bus_bclk_off + * definitions to prevent conflicts. 
*/ &i2s0_2ch_bus { rockchip,pins = @@ -428,6 +445,14 @@ <3 RK_PD7 1 &pcfg_pull_none>; }; +&i2s0_2ch_bus_bclk_off { + rockchip,pins = + <3 RK_PD0 RK_FUNC_GPIO &pcfg_pull_none>, + <3 RK_PD2 1 &pcfg_pull_none>, + <3 RK_PD3 1 &pcfg_pull_none>, + <3 RK_PD7 1 &pcfg_pull_none>; +}; + &io_domains { status = "okay"; bt656-supply = <&vcc_1v8>; @@ -449,9 +474,14 @@ &pinctrl { pinctrl-names = "default"; - pinctrl-0 = <&q7_thermal_pin>; + pinctrl-0 = <&q7_thermal_pin &bios_disable_override_hog_pin>; gpios { + bios_disable_override_hog_pin: bios-disable-override-hog-pin { + rockchip,pins = + <3 RK_PD5 RK_FUNC_GPIO &pcfg_pull_down>; + }; + q7_thermal_pin: q7-thermal-pin { rockchip,pins = <0 RK_PA3 RK_FUNC_GPIO &pcfg_pull_up>; diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi b/arch/arm64/boot/dts/rockchip/rk356x.dtsi index 4690be841a1c..c72b3a608edd 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi @@ -1592,10 +1592,9 @@ <&cru SRST_TSADCPHY>; rockchip,grf = <&grf>; rockchip,hw-tshut-temp = <95000>; - pinctrl-names = "init", "default", "sleep"; - pinctrl-0 = <&tsadc_pin>; - pinctrl-1 = <&tsadc_shutorg>; - pinctrl-2 = <&tsadc_pin>; + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&tsadc_shutorg>; + pinctrl-1 = <&tsadc_pin>; #thermal-sensor-cells = <1>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi index b6e4df180f0b..ee99166ebd46 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi @@ -582,14 +582,14 @@ }; vo0_grf: syscon@fd5a6000 { - compatible = "rockchip,rk3588-vo-grf", "syscon"; + compatible = "rockchip,rk3588-vo0-grf", "syscon"; reg = <0x0 0xfd5a6000 0x0 0x2000>; clocks = <&cru PCLK_VO0GRF>; }; vo1_grf: syscon@fd5a8000 { - compatible = "rockchip,rk3588-vo-grf", "syscon"; - reg = <0x0 0xfd5a8000 0x0 0x100>; + compatible = "rockchip,rk3588-vo1-grf", "syscon"; + reg = <0x0 0xfd5a8000 0x0 0x4000>; clocks = <&cru PCLK_VO1GRF>; }; diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl index cbc980fb02e3..22c9069c0650 100644 --- a/arch/arm64/crypto/poly1305-armv8.pl +++ b/arch/arm64/crypto/poly1305-armv8.pl @@ -473,7 +473,8 @@ poly1305_blocks_neon: subs $len,$len,#64 ldp x9,x13,[$inp,#48] add $in2,$inp,#96 - adr $zeros,.Lzeros + adrp $zeros,.Lzeros + add $zeros,$zeros,#:lo12:.Lzeros lsl $padbit,$padbit,#24 add x15,$ctx,#48 @@ -885,10 +886,13 @@ poly1305_blocks_neon: ret .size poly1305_blocks_neon,.-poly1305_blocks_neon +.pushsection .rodata .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 .asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.popsection + .align 2 #if !defined(__KERNEL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index a4697a0b6835..468a049bc63b 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -33,6 +33,14 @@ static inline void write_pmevtypern(int n, unsigned long val) PMEVN_SWITCH(n, WRITE_PMEVTYPERN); } +#define RETURN_READ_PMEVTYPERN(n) \ + return read_sysreg(pmevtyper##n##_el0) +static inline unsigned long read_pmevtypern(int n) +{ + PMEVN_SWITCH(n, RETURN_READ_PMEVTYPERN); + return 0; +} + static inline unsigned long read_pmmir(void) { return read_cpuid(PMMIR_EL1); @@ -46,6 +54,14 @@ static inline u32 read_pmuver(void) ID_AA64DFR0_EL1_PMUVer_SHIFT); } +static inline bool pmuv3_has_icntr(void) +{ + u64 dfr1 = 
read_sysreg(id_aa64dfr1_el1); + + return !!cpuid_feature_extract_unsigned_field(dfr1, + ID_AA64DFR1_EL1_PMICNTR_SHIFT); +} + static inline void write_pmcr(u64 val) { write_sysreg(val, pmcr_el0); @@ -71,22 +87,32 @@ static inline u64 read_pmccntr(void) return read_sysreg(pmccntr_el0); } -static inline void write_pmcntenset(u32 val) +static inline void write_pmicntr(u64 val) +{ + write_sysreg_s(val, SYS_PMICNTR_EL0); +} + +static inline u64 read_pmicntr(void) +{ + return read_sysreg_s(SYS_PMICNTR_EL0); +} + +static inline void write_pmcntenset(u64 val) { write_sysreg(val, pmcntenset_el0); } -static inline void write_pmcntenclr(u32 val) +static inline void write_pmcntenclr(u64 val) { write_sysreg(val, pmcntenclr_el0); } -static inline void write_pmintenset(u32 val) +static inline void write_pmintenset(u64 val) { write_sysreg(val, pmintenset_el1); } -static inline void write_pmintenclr(u32 val) +static inline void write_pmintenclr(u64 val) { write_sysreg(val, pmintenclr_el1); } @@ -96,12 +122,27 @@ static inline void write_pmccfiltr(u64 val) write_sysreg(val, pmccfiltr_el0); } -static inline void write_pmovsclr(u32 val) +static inline u64 read_pmccfiltr(void) +{ + return read_sysreg(pmccfiltr_el0); +} + +static inline void write_pmicfiltr(u64 val) +{ + write_sysreg_s(val, SYS_PMICFILTR_EL0); +} + +static inline u64 read_pmicfiltr(void) +{ + return read_sysreg_s(SYS_PMICFILTR_EL0); +} + +static inline void write_pmovsclr(u64 val) { write_sysreg(val, pmovsclr_el0); } -static inline u32 read_pmovsclr(void) +static inline u64 read_pmovsclr(void) { return read_sysreg(pmovsclr_el0); } diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 558434267271..3d261cc123c1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -832,6 +832,12 @@ static inline bool system_supports_lpa2(void) return cpus_have_final_cap(ARM64_HAS_LPA2); } +static inline bool system_supports_poe(void) +{ + return IS_ENABLED(CONFIG_ARM64_POE) && + alternative_has_cap_unlikely(ARM64_HAS_S1POE); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 5fd7caea4419..5a7dfeb8e8eb 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -143,6 +143,7 @@ #define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039 #define AMPERE_CPU_PART_AMPERE1 0xAC3 +#define AMPERE_CPU_PART_AMPERE1A 0xAC4 #define MICROSOFT_CPU_PART_AZURE_COBALT_100 0xD49 /* Based on r0p0 of ARM Neoverse N2 */ @@ -212,6 +213,7 @@ #define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX) #define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) +#define MIDR_AMPERE1A MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1A) #define MIDR_MICROSOFT_AZURE_COBALT_100 MIDR_CPU_MODEL(ARM_CPU_IMP_MICROSOFT, MICROSOFT_CPU_PART_AZURE_COBALT_100) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index fd87c4b8f984..e0ffdf13a18b 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -165,42 +165,53 @@ mrs x1, id_aa64dfr0_el1 ubfx x1, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 cmp x1, #3 - b.lt .Lset_debug_fgt_\@ + b.lt 
.Lskip_spe_fgt_\@ /* Disable PMSNEVFR_EL1 read and write traps */ orr x0, x0, #(1 << 62) -.Lset_debug_fgt_\@: +.Lskip_spe_fgt_\@: msr_s SYS_HDFGRTR_EL2, x0 msr_s SYS_HDFGWTR_EL2, x0 mov x0, xzr mrs x1, id_aa64pfr1_el1 ubfx x1, x1, #ID_AA64PFR1_EL1_SME_SHIFT, #4 - cbz x1, .Lset_pie_fgt_\@ + cbz x1, .Lskip_debug_fgt_\@ /* Disable nVHE traps of TPIDR2 and SMPRI */ orr x0, x0, #HFGxTR_EL2_nSMPRI_EL1_MASK orr x0, x0, #HFGxTR_EL2_nTPIDR2_EL0_MASK -.Lset_pie_fgt_\@: +.Lskip_debug_fgt_\@: mrs_s x1, SYS_ID_AA64MMFR3_EL1 ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 - cbz x1, .Lset_fgt_\@ + cbz x1, .Lskip_pie_fgt_\@ /* Disable trapping of PIR_EL1 / PIRE0_EL1 */ orr x0, x0, #HFGxTR_EL2_nPIR_EL1 orr x0, x0, #HFGxTR_EL2_nPIRE0_EL1 -.Lset_fgt_\@: +.Lskip_pie_fgt_\@: + mrs_s x1, SYS_ID_AA64MMFR3_EL1 + ubfx x1, x1, #ID_AA64MMFR3_EL1_S1POE_SHIFT, #4 + cbz x1, .Lskip_poe_fgt_\@ + + /* Disable trapping of POR_EL0 */ + orr x0, x0, #HFGxTR_EL2_nPOR_EL0 + +.Lskip_poe_fgt_\@: msr_s SYS_HFGRTR_EL2, x0 msr_s SYS_HFGWTR_EL2, x0 msr_s SYS_HFGITR_EL2, xzr mrs x1, id_aa64pfr0_el1 // AMU traps UNDEF without AMU ubfx x1, x1, #ID_AA64PFR0_EL1_AMU_SHIFT, #4 - cbz x1, .Lskip_fgt_\@ + cbz x1, .Lskip_amu_fgt_\@ msr_s SYS_HAFGRTR_EL2, xzr + +.Lskip_amu_fgt_\@: + .Lskip_fgt_\@: .endm diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 56c148890daf..da6d2c1c0b03 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -10,63 +10,63 @@ #include <asm/memory.h> #include <asm/sysreg.h> -#define ESR_ELx_EC_UNKNOWN (0x00) -#define ESR_ELx_EC_WFx (0x01) +#define ESR_ELx_EC_UNKNOWN UL(0x00) +#define ESR_ELx_EC_WFx UL(0x01) /* Unallocated EC: 0x02 */ -#define ESR_ELx_EC_CP15_32 (0x03) -#define ESR_ELx_EC_CP15_64 (0x04) -#define ESR_ELx_EC_CP14_MR (0x05) -#define ESR_ELx_EC_CP14_LS (0x06) -#define ESR_ELx_EC_FP_ASIMD (0x07) -#define ESR_ELx_EC_CP10_ID (0x08) /* EL2 only */ -#define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ +#define ESR_ELx_EC_CP15_32 UL(0x03) +#define ESR_ELx_EC_CP15_64 UL(0x04) +#define ESR_ELx_EC_CP14_MR UL(0x05) +#define ESR_ELx_EC_CP14_LS UL(0x06) +#define ESR_ELx_EC_FP_ASIMD UL(0x07) +#define ESR_ELx_EC_CP10_ID UL(0x08) /* EL2 only */ +#define ESR_ELx_EC_PAC UL(0x09) /* EL2 and above */ /* Unallocated EC: 0x0A - 0x0B */ -#define ESR_ELx_EC_CP14_64 (0x0C) -#define ESR_ELx_EC_BTI (0x0D) -#define ESR_ELx_EC_ILL (0x0E) +#define ESR_ELx_EC_CP14_64 UL(0x0C) +#define ESR_ELx_EC_BTI UL(0x0D) +#define ESR_ELx_EC_ILL UL(0x0E) /* Unallocated EC: 0x0F - 0x10 */ -#define ESR_ELx_EC_SVC32 (0x11) -#define ESR_ELx_EC_HVC32 (0x12) /* EL2 only */ -#define ESR_ELx_EC_SMC32 (0x13) /* EL2 and above */ +#define ESR_ELx_EC_SVC32 UL(0x11) +#define ESR_ELx_EC_HVC32 UL(0x12) /* EL2 only */ +#define ESR_ELx_EC_SMC32 UL(0x13) /* EL2 and above */ /* Unallocated EC: 0x14 */ -#define ESR_ELx_EC_SVC64 (0x15) -#define ESR_ELx_EC_HVC64 (0x16) /* EL2 and above */ -#define ESR_ELx_EC_SMC64 (0x17) /* EL2 and above */ -#define ESR_ELx_EC_SYS64 (0x18) -#define ESR_ELx_EC_SVE (0x19) -#define ESR_ELx_EC_ERET (0x1a) /* EL2 only */ +#define ESR_ELx_EC_SVC64 UL(0x15) +#define ESR_ELx_EC_HVC64 UL(0x16) /* EL2 and above */ +#define ESR_ELx_EC_SMC64 UL(0x17) /* EL2 and above */ +#define ESR_ELx_EC_SYS64 UL(0x18) +#define ESR_ELx_EC_SVE UL(0x19) +#define ESR_ELx_EC_ERET UL(0x1a) /* EL2 only */ /* Unallocated EC: 0x1B */ -#define ESR_ELx_EC_FPAC (0x1C) /* EL1 and above */ -#define ESR_ELx_EC_SME (0x1D) +#define ESR_ELx_EC_FPAC UL(0x1C) /* EL1 and above */ +#define ESR_ELx_EC_SME UL(0x1D) /* Unallocated EC: 0x1E */ 
-#define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ -#define ESR_ELx_EC_IABT_LOW (0x20) -#define ESR_ELx_EC_IABT_CUR (0x21) -#define ESR_ELx_EC_PC_ALIGN (0x22) +#define ESR_ELx_EC_IMP_DEF UL(0x1f) /* EL3 only */ +#define ESR_ELx_EC_IABT_LOW UL(0x20) +#define ESR_ELx_EC_IABT_CUR UL(0x21) +#define ESR_ELx_EC_PC_ALIGN UL(0x22) /* Unallocated EC: 0x23 */ -#define ESR_ELx_EC_DABT_LOW (0x24) -#define ESR_ELx_EC_DABT_CUR (0x25) -#define ESR_ELx_EC_SP_ALIGN (0x26) -#define ESR_ELx_EC_MOPS (0x27) -#define ESR_ELx_EC_FP_EXC32 (0x28) +#define ESR_ELx_EC_DABT_LOW UL(0x24) +#define ESR_ELx_EC_DABT_CUR UL(0x25) +#define ESR_ELx_EC_SP_ALIGN UL(0x26) +#define ESR_ELx_EC_MOPS UL(0x27) +#define ESR_ELx_EC_FP_EXC32 UL(0x28) /* Unallocated EC: 0x29 - 0x2B */ -#define ESR_ELx_EC_FP_EXC64 (0x2C) +#define ESR_ELx_EC_FP_EXC64 UL(0x2C) /* Unallocated EC: 0x2D - 0x2E */ -#define ESR_ELx_EC_SERROR (0x2F) -#define ESR_ELx_EC_BREAKPT_LOW (0x30) -#define ESR_ELx_EC_BREAKPT_CUR (0x31) -#define ESR_ELx_EC_SOFTSTP_LOW (0x32) -#define ESR_ELx_EC_SOFTSTP_CUR (0x33) -#define ESR_ELx_EC_WATCHPT_LOW (0x34) -#define ESR_ELx_EC_WATCHPT_CUR (0x35) +#define ESR_ELx_EC_SERROR UL(0x2F) +#define ESR_ELx_EC_BREAKPT_LOW UL(0x30) +#define ESR_ELx_EC_BREAKPT_CUR UL(0x31) +#define ESR_ELx_EC_SOFTSTP_LOW UL(0x32) +#define ESR_ELx_EC_SOFTSTP_CUR UL(0x33) +#define ESR_ELx_EC_WATCHPT_LOW UL(0x34) +#define ESR_ELx_EC_WATCHPT_CUR UL(0x35) /* Unallocated EC: 0x36 - 0x37 */ -#define ESR_ELx_EC_BKPT32 (0x38) +#define ESR_ELx_EC_BKPT32 UL(0x38) /* Unallocated EC: 0x39 */ -#define ESR_ELx_EC_VECTOR32 (0x3A) /* EL2 only */ +#define ESR_ELx_EC_VECTOR32 UL(0x3A) /* EL2 only */ /* Unallocated EC: 0x3B */ -#define ESR_ELx_EC_BRK64 (0x3C) +#define ESR_ELx_EC_BRK64 UL(0x3C) /* Unallocated EC: 0x3D - 0x3F */ -#define ESR_ELx_EC_MAX (0x3F) +#define ESR_ELx_EC_MAX UL(0x3F) #define ESR_ELx_EC_SHIFT (26) #define ESR_ELx_EC_WIDTH (6) @@ -122,8 +122,8 @@ #define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n)) /* Status codes for individual page table levels */ -#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + n) -#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + n) +#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n)) +#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + (n)) #define ESR_ELx_FSC_FAULT_nL (0x2C) #define ESR_ELx_FSC_FAULT_L(n) (((n) < 0 ? ESR_ELx_FSC_FAULT_nL : \ @@ -161,6 +161,7 @@ /* ISS field definitions for exceptions taken in to Hyp */ #define ESR_ELx_FSC_ADDRSZ (0x00) +#define ESR_ELx_FSC_ADDRSZ_L(n) (ESR_ELx_FSC_ADDRSZ + (n)) #define ESR_ELx_CV (UL(1) << 24) #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index bc69ac368d73..f2a84efc3618 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -155,8 +155,6 @@ extern void cpu_enable_sme2(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_fa64(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__unused); -extern u64 read_smcr_features(void); - /* * Helpers to translate bit indices in sve_vq_map to VQ values (and * vice versa). 
This allows find_next_bit() to be used to find the diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 4edd3b61df11..a775adddecf2 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -157,6 +157,7 @@ #define KERNEL_HWCAP_SME_SF8FMA __khwcap2_feature(SME_SF8FMA) #define KERNEL_HWCAP_SME_SF8DP4 __khwcap2_feature(SME_SF8DP4) #define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2) +#define KERNEL_HWCAP_POE __khwcap2_feature(POE) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index 0ae427f352c8..409e239834d1 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -7,4 +7,15 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +#ifdef CONFIG_ARM_PKVM_GUEST +void pkvm_init_hyp_services(void); +#else +static inline void pkvm_init_hyp_services(void) { }; +#endif + +static inline void kvm_arch_init_hyp_services(void) +{ + pkvm_init_hyp_services(); +}; + #endif diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 41fd90895dfc..1ada23a6ec19 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -271,6 +271,10 @@ __iowrite64_copy(void __iomem *to, const void *from, size_t count) * I/O memory mapping functions. */ +typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, + pgprot_t *prot); +int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); + #define ioremap_prot ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index d81cc746e0eb..109a85ee6910 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -107,6 +107,7 @@ /* TCR_EL2 Registers bits */ #define TCR_EL2_DS (1UL << 32) #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) +#define TCR_EL2_HPD (1 << 24) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2181a11b9d92..b36a3b6cc011 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -10,6 +10,7 @@ #include <asm/hyp_image.h> #include <asm/insn.h> #include <asm/virt.h> +#include <asm/sysreg.h> #define ARM_EXIT_WITH_SERROR_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) @@ -235,6 +236,9 @@ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding); extern void __kvm_timer_set_cntvoff(u64 cntvoff); +extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); @@ -259,7 +263,7 @@ extern u64 __kvm_get_mdcr_el2(void); asm volatile( \ " mrs %1, spsr_el2\n" \ " mrs %2, elr_el2\n" \ - "1: at "at_op", %3\n" \ + "1: " __msr_s(at_op, "%3") "\n" \ " isb\n" \ " b 9f\n" \ "2: msr spsr_el2, %1\n" \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a33f5996ca9f..329619c6fa96 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -446,6 +446,12 @@ enum vcpu_sysreg { GCR_EL1, /* Tag Control Register */ TFSRE0_EL1, /* Tag Fault Status Register 
(EL0) */ + POR_EL0, /* Permission Overlay Register 0 (EL0) */ + + /* FP/SIMD/SVE */ + SVCR, + FPMR, + /* 32bit specific registers. */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ @@ -517,6 +523,8 @@ enum vcpu_sysreg { VNCR(PIR_EL1), /* Permission Indirection Register 1 (EL1) */ VNCR(PIRE0_EL1), /* Permission Indirection Register 0 (EL1) */ + VNCR(POR_EL1), /* Permission Overlay Register 1 (EL1) */ + VNCR(HFGRTR_EL2), VNCR(HFGWTR_EL2), VNCR(HFGITR_EL2), @@ -530,6 +538,8 @@ enum vcpu_sysreg { VNCR(CNTP_CVAL_EL0), VNCR(CNTP_CTL_EL0), + VNCR(ICH_HCR_EL2), + NR_SYS_REGS /* Nothing after this line! */ }; @@ -595,6 +605,16 @@ struct kvm_host_data { struct cpu_sve_state *sve_state; }; + union { + /* HYP VA pointer to the host storage for FPMR */ + u64 *fpmr_ptr; + /* + * Used by pKVM only, as it needs to provide storage + * for the host + */ + u64 fpmr; + }; + /* Ownership of the FP regs */ enum { FP_STATE_FREE, @@ -664,8 +684,6 @@ struct kvm_vcpu_arch { void *sve_state; enum fp_type fp_type; unsigned int sve_max_vl; - u64 svcr; - u64 fpmr; /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; @@ -1330,12 +1348,12 @@ void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM -void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); -void kvm_clr_pmu_events(u32 clr); +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr); +void kvm_clr_pmu_events(u64 clr); bool kvm_set_pmuserenr(u64 val); #else -static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} -static inline void kvm_clr_pmu_events(u32 clr) {} +static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {} +static inline void kvm_clr_pmu_events(u64 clr) {} static inline bool kvm_set_pmuserenr(u64 val) { return false; @@ -1473,4 +1491,8 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); (pa + pi + pa3) == 1; \ }) +#define kvm_has_fpmr(k) \ + (system_supports_fpmr() && \ + kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 216ca424bb16..cd4087fbda9a 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -352,5 +352,11 @@ static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) return &kvm->arch.mmu != mmu; } +#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm); +#else +static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {} +#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */ + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 5b06c31035a2..e8bc6d67aba2 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -85,7 +85,7 @@ struct kvm_s2_trans { bool readable; int level; u32 esr; - u64 upper_attr; + u64 desc; }; static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans) @@ -115,7 +115,7 @@ static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) { - return !(trans->upper_attr & BIT(54)); + return !(trans->desc & BIT(54)); } extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, @@ -205,4 +205,40 @@ static inline u64 
kvm_encode_nested_level(struct kvm_s2_trans *trans) return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level); } +/* Adjust alignment for the contiguous bit as per StageOA() */ +#define contiguous_bit_shift(d, wi, l) \ + ({ \ + u8 shift = 0; \ + \ + if ((d) & PTE_CONT) { \ + switch (BIT((wi)->pgshift)) { \ + case SZ_4K: \ + shift = 4; \ + break; \ + case SZ_16K: \ + shift = (l) == 2 ? 5 : 7; \ + break; \ + case SZ_64K: \ + shift = 5; \ + break; \ + } \ + } \ + \ + shift; \ + }) + +static inline unsigned int ps_to_output_size(unsigned int ps) +{ + switch (ps) { + case 0: return 32; + case 1: return 36; + case 2: return 40; + case 3: return 42; + case 4: return 44; + case 5: + default: + return 48; + } +} + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 19278dfe7978..03f4c3d7839c 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -59,6 +59,48 @@ typedef u64 kvm_pte_t; #define KVM_PHYS_INVALID (-1ULL) +#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) + +#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) + +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + +#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) + +#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_HI_S2_XN) + +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) +#define KVM_MAX_OWNER_ID 1 + +/* + * Used to indicate a pte for which a 'break-before-make' sequence is in + * progress. 
+ */ +#define KVM_INVALID_PTE_LOCKED BIT(10) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..b0c9a86b13a4 --- /dev/null +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_MEM_ENCRYPT_H +#define __ASM_MEM_ENCRYPT_H + +struct arm64_mem_crypt_ops { + int (*encrypt)(unsigned long addr, int numpages); + int (*decrypt)(unsigned long addr, int numpages); +}; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops); + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 54fb014eba05..0480c61dbb4f 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -110,6 +110,8 @@ #define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ +#define PHYSMEM_END __pa(PAGE_END - 1) + #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) /* diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 5966ee4a6154..52791715f6e6 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -7,7 +7,7 @@ #include <uapi/asm/mman.h> static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, - unsigned long pkey __always_unused) + unsigned long pkey) { unsigned long ret = 0; @@ -17,6 +17,14 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, if (system_supports_mte() && (prot & PROT_MTE)) ret |= VM_MTE; +#ifdef CONFIG_ARCH_HAS_PKEYS + if (system_supports_poe()) { + ret |= pkey & BIT(0) ? VM_PKEY_BIT0 : 0; + ret |= pkey & BIT(1) ? VM_PKEY_BIT1 : 0; + ret |= pkey & BIT(2) ? VM_PKEY_BIT2 : 0; + } +#endif + return ret; } #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 65977c7783c5..2ec96d91acc6 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -25,6 +25,7 @@ typedef struct { refcount_t pinned; void *vdso; unsigned long flags; + u8 pkey_allocation_map; } mm_context_t; /* @@ -63,7 +64,6 @@ static inline bool arm64_kernel_unmapped_at_el0(void) extern void arm64_memblock_init(void); extern void paging_init(void); extern void bootmem_init(void); -extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void create_mapping_noalloc(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index bd19f4c758b7..7c09d47e09cb 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -15,12 +15,12 @@ #include <linux/sched/hotplug.h> #include <linux/mm_types.h> #include <linux/pgtable.h> +#include <linux/pkeys.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/proc-fns.h> -#include <asm-generic/mm_hooks.h> #include <asm/cputype.h> #include <asm/sysreg.h> #include <asm/tlbflush.h> @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) { atomic64_set(&mm->context.id, 0); refcount_set(&mm->context.pinned, 0); + + /* pkey 0 is the default, so always reserve it. 
*/ + mm->context.pkey_allocation_map = BIT(0); + + return 0; +} + +static inline void arch_dup_pkeys(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + /* Duplicate the oldmm pkey state in mm: */ + mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; +} + +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + arch_dup_pkeys(oldmm, mm); + return 0; } +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void update_saved_ttbr0(struct task_struct *tsk, struct mm_struct *mm) @@ -267,6 +294,23 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) return -1UL >> 8; } +/* + * Only enforce protection keys on the current process, because there is no + * user context to access POR_EL0 for another address space. + */ +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool execute, bool foreign) +{ + if (!system_supports_poe()) + return true; + + /* allow access if the VMA is not one from this process */ + if (foreign || vma_is_foreign(vma)) + return true; + + return por_el0_allows_pkey(vma_pkey(vma), write, execute); +} + #include <asm-generic/mmu_context.h> #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 1f60aa1bc750..fd330c1db289 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -135,7 +135,6 @@ /* * Section */ -#define PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) #define PMD_SECT_USER (_AT(pmdval_t, 1) << 6) /* AP[1] */ #define PMD_SECT_RDONLY (_AT(pmdval_t, 1) << 7) /* AP[2] */ #define PMD_SECT_S (_AT(pmdval_t, 3) << 8) @@ -200,11 +199,26 @@ #define PTE_PI_IDX_3 54 /* UXN */ /* + * POIndex[2:0] encoding (Permission Overlay Extension) + */ +#define PTE_PO_IDX_0 (_AT(pteval_t, 1) << 60) +#define PTE_PO_IDX_1 (_AT(pteval_t, 1) << 61) +#define PTE_PO_IDX_2 (_AT(pteval_t, 1) << 62) + +#define PTE_PO_IDX_MASK GENMASK_ULL(62, 60) + + +/* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ #define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2) /* + * Hierarchical permission for Stage-1 tables + */ +#define S1_TABLE_AP (_AT(pmdval_t, 3) << 61) + +/* * Highest possible physical address supported. 
*/ #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) @@ -298,6 +312,10 @@ #define TCR_TBI1 (UL(1) << 38) #define TCR_HA (UL(1) << 39) #define TCR_HD (UL(1) << 40) +#define TCR_HPD0_SHIFT 41 +#define TCR_HPD0 (UL(1) << TCR_HPD0_SHIFT) +#define TCR_HPD1_SHIFT 42 +#define TCR_HPD1 (UL(1) << TCR_HPD1_SHIFT) #define TCR_TBID0 (UL(1) << 51) #define TCR_TBID1 (UL(1) << 52) #define TCR_NFD0 (UL(1) << 53) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index b11cfb9fdd37..2a11d0c10760 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -154,10 +154,10 @@ static inline bool __pure lpa2_is_enabled(void) #define PIE_E0 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW)) + PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7a4f5604be3f..96c2b0b07c4c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -34,6 +34,7 @@ #include <asm/cmpxchg.h> #include <asm/fixmap.h> +#include <asm/por.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/sched.h> @@ -149,6 +150,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_accessible(mm, pte) \ (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte)) +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute) +{ + u64 por; + + if (!system_supports_poe()) + return true; + + por = read_sysreg_s(SYS_POR_EL0); + + if (write) + return por_elx_allows_write(por, pkey); + + if (execute) + return por_elx_allows_exec(por, pkey); + + return por_elx_allows_read(por, pkey); +} + /* * p??_access_permitted() is true for valid user mappings (PTE_USER * bit set, subject to the write permission check). For execute-only @@ -156,8 +175,11 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) * not set) must return false. PROT_NONE mappings do not have the * PTE_VALID bit set. */ -#define pte_access_permitted(pte, write) \ +#define pte_access_permitted_no_overlay(pte, write) \ (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) +#define pte_access_permitted(pte, write) \ + (pte_access_permitted_no_overlay(pte, write) && \ + por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) #define pmd_access_permitted(pmd, write) \ (pte_access_permitted(pmd_pte(pmd), (write))) #define pud_access_permitted(pud, write) \ @@ -373,10 +395,11 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) /* * If the PTE would provide user space access to the tags associated * with it then ensure that the MTE tags are synchronised. Although - * pte_access_permitted() returns false for exec only mappings, they - * don't expose tags (instruction fetches don't check tags). + * pte_access_permitted_no_overlay() returns false for exec only + * mappings, they don't expose tags (instruction fetches don't check + * tags). 
*/ - if (system_supports_mte() && pte_access_permitted(pte, false) && + if (system_supports_mte() && pte_access_permitted_no_overlay(pte, false) && !pte_special(pte) && pte_tagged(pte)) mte_sync_tags(pte, nr_pages); } @@ -1103,7 +1126,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE | - PTE_GP | PTE_ATTRINDX_MASK; + PTE_GP | PTE_ATTRINDX_MASK | PTE_PO_IDX_MASK; + /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h new file mode 100644 index 000000000000..0ca5f83ce148 --- /dev/null +++ b/arch/arm64/include/asm/pkeys.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. + * + * Based on arch/x86/include/asm/pkeys.h + */ + +#ifndef _ASM_ARM64_PKEYS_H +#define _ASM_ARM64_PKEYS_H + +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2) + +#define arch_max_pkey() 8 + +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val); + +static inline bool arch_pkeys_enabled(void) +{ + return system_supports_poe(); +} + +static inline int vma_pkey(struct vm_area_struct *vma) +{ + return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT; +} + +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, + int prot, int pkey) +{ + if (pkey != -1) + return pkey; + + return vma_pkey(vma); +} + +static inline int execute_only_pkey(struct mm_struct *mm) +{ + // Execute-only mappings are handled by EPAN/FEAT_PAN3. + return -1; +} + +#define mm_pkey_allocation_map(mm) (mm)->context.pkey_allocation_map +#define mm_set_pkey_allocated(mm, pkey) do { \ + mm_pkey_allocation_map(mm) |= (1U << pkey); \ +} while (0) +#define mm_set_pkey_free(mm, pkey) do { \ + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ +} while (0) + +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) +{ + /* + * "Allocated" pkeys are those that have been returned + * from pkey_alloc() or pkey 0 which is allocated + * implicitly when the mm is created. + */ + if (pkey < 0 || pkey >= arch_max_pkey()) + return false; + + return mm_pkey_allocation_map(mm) & (1U << pkey); +} + +/* + * Returns a positive, 3-bit key on success, or -1 on failure. + */ +static inline int mm_pkey_alloc(struct mm_struct *mm) +{ + /* + * Note: this is the one and only place we make sure + * that the pkey is valid as far as the hardware is + * concerned. The rest of the kernel trusts that + * only good, valid pkeys come out of here. + */ + u8 all_pkeys_mask = GENMASK(arch_max_pkey() - 1, 0); + int ret; + + if (!arch_pkeys_enabled()) + return -1; + + /* + * Are we out of pkeys? We must handle this specially + * because ffz() behavior is undefined if there are no + * zeros. 
+ */ + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) + return -1; + + ret = ffz(mm_pkey_allocation_map(mm)); + + mm_set_pkey_allocated(mm, ret); + + return ret; +} + +static inline int mm_pkey_free(struct mm_struct *mm, int pkey) +{ + if (!mm_pkey_is_allocated(mm, pkey)) + return -EINVAL; + + mm_set_pkey_free(mm, pkey); + + return 0; +} + +#endif /* _ASM_ARM64_PKEYS_H */ diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h new file mode 100644 index 000000000000..e06e9f473675 --- /dev/null +++ b/arch/arm64/include/asm/por.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. + */ + +#ifndef _ASM_ARM64_POR_H +#define _ASM_ARM64_POR_H + +#define POR_BITS_PER_PKEY 4 +#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> ((idx) * POR_BITS_PER_PKEY)) & 0xf) + +static inline bool por_elx_allows_read(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_R; +} + +static inline bool por_elx_allows_write(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_W; +} + +static inline bool por_elx_allows_exec(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_X; +} + +#endif /* _ASM_ARM64_POR_H */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index f77371232d8c..1438424f0064 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -184,6 +184,7 @@ struct thread_struct { u64 sctlr_user; u64 svcr; u64 tpidr2_el0; + u64 por_el0; }; static inline unsigned int thread_get_vl(struct thread_struct *thread, @@ -402,5 +403,10 @@ long get_tagged_addr_ctrl(struct task_struct *task); #define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current) #endif +int get_tsc_mode(unsigned long adr); +int set_tsc_mode(unsigned int val); +#define GET_TSC_CTL(adr) get_tsc_mode((adr)) +#define SET_TSC_CTL(val) set_tsc_mode((val)) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 5b1701c76d1c..6cf4aae05219 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -5,6 +5,8 @@ #ifndef __ASM_PTDUMP_H #define __ASM_PTDUMP_H +#include <linux/ptdump.h> + #ifdef CONFIG_PTDUMP_CORE #include <linux/mm_types.h> @@ -21,14 +23,53 @@ struct ptdump_info { unsigned long base_addr; }; +struct ptdump_prot_bits { + u64 mask; + u64 val; + const char *set; + const char *clear; +}; + +struct ptdump_pg_level { + const struct ptdump_prot_bits *bits; + char name[4]; + int num; + u64 mask; +}; + +/* + * The page dumper groups page table entries of the same type into a single + * description. It uses pg_state to track the range information while + * iterating over the pte entries. When the continuity is broken it then + * dumps out a description of the range. 
+ */ +struct ptdump_pg_state { + struct ptdump_state ptdump; + struct ptdump_pg_level *pg_level; + struct seq_file *seq; + const struct addr_marker *marker; + const struct mm_struct *mm; + unsigned long start_address; + int level; + u64 current_prot; + bool check_wx; + unsigned long wx_pages; + unsigned long uxn_pages; +}; + void ptdump_walk(struct seq_file *s, struct ptdump_info *info); +void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, + u64 val); #ifdef CONFIG_PTDUMP_DEBUGFS #define EFI_RUNTIME_MAP_END DEFAULT_MAP_WINDOW_64 void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name); #else static inline void ptdump_debugfs_register(struct ptdump_info *info, const char *name) { } -#endif +#endif /* CONFIG_PTDUMP_DEBUGFS */ +#else +static inline void note_page(struct ptdump_state *pt_st, unsigned long addr, + int level, u64 val) { } #endif /* CONFIG_PTDUMP_CORE */ #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 0f740b781187..917761feeffd 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -3,6 +3,7 @@ #ifndef _ASM_ARM64_SET_MEMORY_H #define _ASM_ARM64_SET_MEMORY_H +#include <asm/mem_encrypt.h> #include <asm-generic/set_memory.h> bool can_set_direct_map(void); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 4a9ea103817e..9ea97dddefc4 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -109,6 +109,9 @@ #define set_pstate_ssbs(x) asm volatile(SET_PSTATE_SSBS(x)) #define set_pstate_dit(x) asm volatile(SET_PSTATE_DIT(x)) +/* Register-based PAN access, for save/restore purposes */ +#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3) + #define __SYS_BARRIER_INSN(CRm, op2, Rt) \ __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f)) @@ -325,7 +328,25 @@ #define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0) #define SYS_PAR_EL1_F BIT(0) +/* When PAR_EL1.F == 1 */ #define SYS_PAR_EL1_FST GENMASK(6, 1) +#define SYS_PAR_EL1_PTW BIT(8) +#define SYS_PAR_EL1_S BIT(9) +#define SYS_PAR_EL1_AssuredOnly BIT(12) +#define SYS_PAR_EL1_TopLevel BIT(13) +#define SYS_PAR_EL1_Overlay BIT(14) +#define SYS_PAR_EL1_DirtyBit BIT(15) +#define SYS_PAR_EL1_F1_IMPDEF GENMASK_ULL(63, 48) +#define SYS_PAR_EL1_F1_RES0 (BIT(7) | BIT(10) | GENMASK_ULL(47, 16)) +#define SYS_PAR_EL1_RES1 BIT(11) +/* When PAR_EL1.F == 0 */ +#define SYS_PAR_EL1_SH GENMASK_ULL(8, 7) +#define SYS_PAR_EL1_NS BIT(9) +#define SYS_PAR_EL1_F0_IMPDEF BIT(10) +#define SYS_PAR_EL1_NSE BIT(11) +#define SYS_PAR_EL1_PA GENMASK_ULL(51, 12) +#define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56) +#define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52)) /*** Statistical Profiling Extension ***/ #define PMSEVFR_EL1_RES0_IMP \ @@ -403,7 +424,6 @@ #define SYS_PMCNTENCLR_EL0 sys_reg(3, 3, 9, 12, 2) #define SYS_PMOVSCLR_EL0 sys_reg(3, 3, 9, 12, 3) #define SYS_PMSWINC_EL0 sys_reg(3, 3, 9, 12, 4) -#define SYS_PMSELR_EL0 sys_reg(3, 3, 9, 12, 5) #define SYS_PMCEID0_EL0 sys_reg(3, 3, 9, 12, 6) #define SYS_PMCEID1_EL0 sys_reg(3, 3, 9, 12, 7) #define SYS_PMCCNTR_EL0 sys_reg(3, 3, 9, 13, 0) @@ -652,6 +672,7 @@ #define OP_AT_S12E1W sys_insn(AT_Op0, 4, AT_CRn, 8, 5) #define OP_AT_S12E0R sys_insn(AT_Op0, 4, AT_CRn, 8, 6) #define OP_AT_S12E0W sys_insn(AT_Op0, 4, AT_CRn, 8, 7) +#define OP_AT_S1E2A sys_insn(AT_Op0, 4, AT_CRn, 9, 2) /* TLBI instructions */ #define TLBI_Op0 1 @@ -1077,6 +1098,9 @@ #define POE_RXW UL(0x7) #define POE_MASK UL(0xf) +/* 
Initial value for Permission Overlay Extension for EL0 */ +#define POR_EL0_INIT POE_RXW + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index e72a3bf9e563..1114c1c3300a 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -81,6 +81,7 @@ void arch_setup_new_exec(void); #define TIF_SME 27 /* SME in use */ #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */ +#define TIF_TSC_SIGSEGV 30 /* SIGSEGV on counter-timer access */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) @@ -97,6 +98,7 @@ void arch_setup_new_exec(void); #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_TSC_SIGSEGV (1 << TIF_TSC_SIGSEGV) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index eefe766d6161..d780d1bd2eac 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -25,6 +25,7 @@ try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn) void force_signal_inject(int signal, int code, unsigned long address, unsigned long err); void arm64_notify_segfault(unsigned long addr); void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str); +void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey); void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str); void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str); diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h index df2c47c55972..06f8ec0906a6 100644 --- a/arch/arm64/include/asm/vncr_mapping.h +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -52,6 +52,7 @@ #define VNCR_PIRE0_EL1 0x290 #define VNCR_PIRE0_EL2 0x298 #define VNCR_PIR_EL1 0x2A0 +#define VNCR_POR_EL1 0x2A8 #define VNCR_ICH_LR0_EL2 0x400 #define VNCR_ICH_LR1_EL2 0x408 #define VNCR_ICH_LR2_EL2 0x410 diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 285610e626f5..055381b2c615 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -122,5 +122,6 @@ #define HWCAP2_SME_SF8FMA (1UL << 60) #define HWCAP2_SME_SF8DP4 (1UL << 61) #define HWCAP2_SME_SF8DP2 (1UL << 62) +#define HWCAP2_POE (1UL << 63) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/include/uapi/asm/mman.h b/arch/arm64/include/uapi/asm/mman.h index 1e6482a838e1..e7e0c8216243 100644 --- a/arch/arm64/include/uapi/asm/mman.h +++ b/arch/arm64/include/uapi/asm/mman.h @@ -7,4 +7,13 @@ #define PROT_BTI 0x10 /* BTI guarded page */ #define PROT_MTE 0x20 /* Normal Tagged mapping */ +/* Override any generic PKEY permission defines */ +#define PKEY_DISABLE_EXECUTE 0x4 +#define PKEY_DISABLE_READ 0x8 +#undef PKEY_ACCESS_MASK +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ + PKEY_DISABLE_WRITE |\ + PKEY_DISABLE_READ |\ + PKEY_DISABLE_EXECUTE) + #endif /* ! 
_UAPI__ASM_MMAN_H */ diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 8a45b7a411e0..bb7af77a30a7 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -98,6 +98,13 @@ struct esr_context { __u64 esr; }; +#define POE_MAGIC 0x504f4530 + +struct poe_context { + struct _aarch64_ctx head; + __u64 por_el0; +}; + /* * extra_context: describes extra space in the signal frame for * additional structures that don't fit in sigcontext.__reserved[]. @@ -320,10 +327,10 @@ struct zt_context { ((sizeof(struct za_context) + (__SVE_VQ_BYTES - 1)) \ / __SVE_VQ_BYTES * __SVE_VQ_BYTES) -#define ZA_SIG_REGS_SIZE(vq) ((vq * __SVE_VQ_BYTES) * (vq * __SVE_VQ_BYTES)) +#define ZA_SIG_REGS_SIZE(vq) (((vq) * __SVE_VQ_BYTES) * ((vq) * __SVE_VQ_BYTES)) #define ZA_SIG_ZAV_OFFSET(vq, n) (ZA_SIG_REGS_OFFSET + \ - (SVE_SIG_ZREG_SIZE(vq) * n)) + (SVE_SIG_ZREG_SIZE(vq) * (n))) #define ZA_SIG_CONTEXT_SIZE(vq) \ (ZA_SIG_REGS_OFFSET + ZA_SIG_REGS_SIZE(vq)) @@ -334,7 +341,7 @@ struct zt_context { #define ZT_SIG_REGS_OFFSET sizeof(struct zt_context) -#define ZT_SIG_REGS_SIZE(n) (ZT_SIG_REG_BYTES * n) +#define ZT_SIG_REGS_SIZE(n) (ZT_SIG_REG_BYTES * (n)) #define ZT_SIG_CONTEXT_SIZE(n) \ (sizeof(struct zt_context) + ZT_SIG_REGS_SIZE(n)) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index f6b6b4507357..dfefbdf4073a 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -456,6 +456,14 @@ static const struct midr_range erratum_spec_ssbs_list[] = { }; #endif +#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 +static const struct midr_range erratum_ac03_cpu_38_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -772,7 +780,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { { .desc = "AmpereOne erratum AC03_CPU_38", .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38, - ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1), + ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), }, #endif { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 646ecd3069fd..718728a85430 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -466,6 +466,8 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE), + FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), ARM64_FTR_END, @@ -2348,6 +2350,14 @@ static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); } +#ifdef CONFIG_ARM64_POE +static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1x_E0POE); + sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_E0POE); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2870,6 +2880,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_nv1, ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1) }, +#ifdef CONFIG_ARM64_POE + { + .desc = 
"Stage-1 Permission Overlay Extension (S1POE)", + .capability = ARM64_HAS_S1POE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_poe, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) + }, +#endif {}, }; @@ -3034,6 +3054,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), +#ifdef CONFIG_ARM64_POE + HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE), +#endif {}, }; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 09eeaa24d456..44718d0482b3 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -143,6 +143,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_SF8FMA] = "smesf8fma", [KERNEL_HWCAP_SME_SF8DP4] = "smesf8dp4", [KERNEL_HWCAP_SME_SF8DP2] = "smesf8dp2", + [KERNEL_HWCAP_POE] = "poe", }; #ifdef CONFIG_COMPAT @@ -280,7 +281,7 @@ const struct seq_operations cpuinfo_op = { }; -static struct kobj_type cpuregs_kobj_type = { +static const struct kobj_type cpuregs_kobj_type = { .sysfs_ops = &kobj_sysfs_ops, }; diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 02870beb271e..7b11d84f533c 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -407,7 +407,7 @@ int swsusp_arch_resume(void) void *, phys_addr_t, phys_addr_t); struct trans_pgd_info trans_info = { .trans_alloc_page = hibernate_page_alloc, - .trans_alloc_arg = (void *)GFP_ATOMIC, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, }; /* diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index f872c57e9909..fd9a7bed83ce 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -6,28 +6,7 @@ * Copyright (C) 2014 ARM Ltd. */ -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/kernel.h> -#include <linux/mm.h> #include <linux/pci.h> -#include <linux/pci-acpi.h> -#include <linux/pci-ecam.h> -#include <linux/slab.h> - -#ifdef CONFIG_ACPI -/* - * Try to assign the IRQ number when probing a new device - */ -int pcibios_alloc_irq(struct pci_dev *dev) -{ - if (!acpi_disabled) - acpi_pci_irq_enable(dev); - - return 0; -} -#endif /* * raw_pci_read/write - Platform-specific PCI config space access. @@ -61,173 +40,3 @@ int pcibus_to_node(struct pci_bus *bus) EXPORT_SYMBOL(pcibus_to_node); #endif - -#ifdef CONFIG_ACPI - -struct acpi_pci_generic_root_info { - struct acpi_pci_root_info common; - struct pci_config_window *cfg; /* config space mapping */ -}; - -int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) -{ - struct pci_config_window *cfg = bus->sysdata; - struct acpi_device *adev = to_acpi_device(cfg->parent); - struct acpi_pci_root *root = acpi_driver_data(adev); - - return root->segment; -} - -int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) -{ - struct pci_config_window *cfg; - struct acpi_device *adev; - struct device *bus_dev; - - if (acpi_disabled) - return 0; - - cfg = bridge->bus->sysdata; - - /* - * On Hyper-V there is no corresponding ACPI device for a root bridge, - * therefore ->parent is set as NULL by the driver. And set 'adev' as - * NULL in this case because there is no proper ACPI device. 
- */ - if (!cfg->parent) - adev = NULL; - else - adev = to_acpi_device(cfg->parent); - - bus_dev = &bridge->bus->dev; - - ACPI_COMPANION_SET(&bridge->dev, adev); - set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); - - return 0; -} - -static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) -{ - struct resource_entry *entry, *tmp; - int status; - - status = acpi_pci_probe_root_resources(ci); - resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { - if (!(entry->res->flags & IORESOURCE_WINDOW)) - resource_list_destroy_entry(entry); - } - return status; -} - -/* - * Lookup the bus range for the domain in MCFG, and set up config space - * mapping. - */ -static struct pci_config_window * -pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) -{ - struct device *dev = &root->device->dev; - struct resource *bus_res = &root->secondary; - u16 seg = root->segment; - const struct pci_ecam_ops *ecam_ops; - struct resource cfgres; - struct acpi_device *adev; - struct pci_config_window *cfg; - int ret; - - ret = pci_mcfg_lookup(root, &cfgres, &ecam_ops); - if (ret) { - dev_err(dev, "%04x:%pR ECAM region not found\n", seg, bus_res); - return NULL; - } - - adev = acpi_resource_consumer(&cfgres); - if (adev) - dev_info(dev, "ECAM area %pR reserved by %s\n", &cfgres, - dev_name(&adev->dev)); - else - dev_warn(dev, FW_BUG "ECAM area %pR not reserved in ACPI namespace\n", - &cfgres); - - cfg = pci_ecam_create(dev, &cfgres, bus_res, ecam_ops); - if (IS_ERR(cfg)) { - dev_err(dev, "%04x:%pR error %ld mapping ECAM\n", seg, bus_res, - PTR_ERR(cfg)); - return NULL; - } - - return cfg; -} - -/* release_info: free resources allocated by init_info */ -static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci) -{ - struct acpi_pci_generic_root_info *ri; - - ri = container_of(ci, struct acpi_pci_generic_root_info, common); - pci_ecam_free(ri->cfg); - kfree(ci->ops); - kfree(ri); -} - -/* Interface called from ACPI code to setup PCI host controller */ -struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) -{ - struct acpi_pci_generic_root_info *ri; - struct pci_bus *bus, *child; - struct acpi_pci_root_ops *root_ops; - struct pci_host_bridge *host; - - ri = kzalloc(sizeof(*ri), GFP_KERNEL); - if (!ri) - return NULL; - - root_ops = kzalloc(sizeof(*root_ops), GFP_KERNEL); - if (!root_ops) { - kfree(ri); - return NULL; - } - - ri->cfg = pci_acpi_setup_ecam_mapping(root); - if (!ri->cfg) { - kfree(ri); - kfree(root_ops); - return NULL; - } - - root_ops->release_info = pci_acpi_generic_release_info; - root_ops->prepare_resources = pci_acpi_root_prepare_resources; - root_ops->pci_ops = (struct pci_ops *)&ri->cfg->ops->pci_ops; - bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); - if (!bus) - return NULL; - - /* If we must preserve the resource configuration, claim now */ - host = pci_find_host_bridge(bus); - if (host->preserve_config) - pci_bus_claim_resources(bus); - - /* - * Assign whatever was left unassigned. If we didn't claim above, - * this will reassign everything. 
- */ - pci_assign_unassigned_root_bus_resources(bus); - - list_for_each_entry(child, &bus->children, node) - pcie_bus_configure_settings(child); - - return bus; -} - -void pcibios_add_bus(struct pci_bus *bus) -{ - acpi_pci_add_bus(bus); -} - -void pcibios_remove_bus(struct pci_bus *bus) -{ - acpi_pci_remove_bus(bus); -} - -#endif diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4ae31b7af6c3..0540653fbf38 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -43,6 +43,7 @@ #include <linux/stacktrace.h> #include <asm/alternative.h> +#include <asm/arch_timer.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/cacheflush.h> @@ -271,12 +272,21 @@ static void flush_tagged_addr_state(void) clear_thread_flag(TIF_TAGGED_ADDR); } +static void flush_poe(void) +{ + if (!system_supports_poe()) + return; + + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + void flush_thread(void) { fpsimd_flush_thread(); tls_thread_flush(); flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); + flush_poe(); } void arch_release_task_struct(struct task_struct *tsk) @@ -371,6 +381,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (system_supports_tpidr2()) p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + if (system_supports_poe()) + p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (stack_start) { if (is_compat_thread(task_thread_info(p))) childregs->compat_sp = stack_start; @@ -472,27 +485,63 @@ static void entry_task_switch(struct task_struct *next) } /* - * ARM erratum 1418040 handling, affecting the 32bit view of CNTVCT. - * Ensure access is disabled when switching to a 32bit task, ensure - * access is enabled when switching to a 64bit task. + * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of + * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} + * accesses and prctl(PR_SET_TSC). Ensure access is disabled iff a workaround is + * required or PR_TSC_SIGSEGV is set. 
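The comment above refers to prctl(PR_SET_TSC); the do_set_tsc_mode()/set_tsc_mode()/get_tsc_mode() helpers added later in this hunk wire that prctl up for arm64. A minimal userspace sketch (illustrative only, using nothing beyond the documented prctl constants) of the PR_TSC_SIGSEGV behaviour:

#include <stdio.h>
#include <stdint.h>
#include <sys/prctl.h>

static uint64_t read_cntvct(void)
{
	uint64_t val;

	/* Direct EL0 read of the virtual counter */
	asm volatile("mrs %0, cntvct_el0" : "=r"(val));
	return val;
}

int main(void)
{
	printf("counter: %llu\n", (unsigned long long)read_cntvct());

	if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0))
		perror("prctl(PR_SET_TSC)");

	/*
	 * With PR_TSC_SIGSEGV selected, this read is expected to be
	 * trapped and delivered back to the task as SIGSEGV.
	 */
	printf("counter: %llu\n", (unsigned long long)read_cntvct());
	return 0;
}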
*/ -static void erratum_1418040_thread_switch(struct task_struct *next) +static void update_cntkctl_el1(struct task_struct *next) { - if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) || - !this_cpu_has_cap(ARM64_WORKAROUND_1418040)) - return; + struct thread_info *ti = task_thread_info(next); - if (is_compat_thread(task_thread_info(next))) + if (test_ti_thread_flag(ti, TIF_TSC_SIGSEGV) || + has_erratum_handler(read_cntvct_el0) || + (IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) && + this_cpu_has_cap(ARM64_WORKAROUND_1418040) && + is_compat_thread(ti))) sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0); else sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN); } -static void erratum_1418040_new_exec(void) +static void cntkctl_thread_switch(struct task_struct *prev, + struct task_struct *next) +{ + if ((read_ti_thread_flags(task_thread_info(prev)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV)) != + (read_ti_thread_flags(task_thread_info(next)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV))) + update_cntkctl_el1(next); +} + +static int do_set_tsc_mode(unsigned int val) { + bool tsc_sigsegv; + + if (val == PR_TSC_SIGSEGV) + tsc_sigsegv = true; + else if (val == PR_TSC_ENABLE) + tsc_sigsegv = false; + else + return -EINVAL; + preempt_disable(); - erratum_1418040_thread_switch(current); + update_thread_flag(TIF_TSC_SIGSEGV, tsc_sigsegv); + update_cntkctl_el1(current); preempt_enable(); + + return 0; +} + +static void permission_overlay_switch(struct task_struct *next) +{ + if (!system_supports_poe()) + return; + + current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (current->thread.por_el0 != next->thread.por_el0) { + write_sysreg_s(next->thread.por_el0, SYS_POR_EL0); + } } /* @@ -528,8 +577,9 @@ struct task_struct *__switch_to(struct task_struct *prev, contextidr_thread_switch(next); entry_task_switch(next); ssbs_thread_switch(next); - erratum_1418040_thread_switch(next); + cntkctl_thread_switch(prev, next); ptrauth_thread_switch_user(next); + permission_overlay_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case @@ -645,7 +695,7 @@ void arch_setup_new_exec(void) current->mm->context.flags = mmflags; ptrauth_thread_init_user(); mte_thread_init_user(); - erratum_1418040_new_exec(); + do_set_tsc_mode(PR_TSC_ENABLE); if (task_spec_ssb_noexec(current)) { arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, @@ -754,3 +804,26 @@ int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, return prot; } #endif + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (is_compat_task()) + return -EINVAL; + + if (test_thread_flag(TIF_TSC_SIGSEGV)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (is_compat_task()) + return -EINVAL; + + return do_set_tsc_mode(val); +} diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 0d022599eb61..b756578aeaee 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1440,6 +1440,39 @@ static int tagged_addr_ctrl_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_POE +static int poe_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_poe()) + return -EINVAL; + + return membuf_write(&to, &target->thread.por_el0, + sizeof(target->thread.por_el0)); +} + +static int poe_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const 
void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + if (!system_supports_poe()) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + target->thread.por_el0 = ctrl; + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1469,6 +1502,9 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI REGSET_TAGGED_ADDR_CTRL, #endif +#ifdef CONFIG_ARM64_POE + REGSET_POE +#endif }; static const struct user_regset aarch64_regsets[] = { @@ -1628,6 +1664,16 @@ static const struct user_regset aarch64_regsets[] = { .set = tagged_addr_ctrl_set, }, #endif +#ifdef CONFIG_ARM64_POE + [REGSET_POE] = { + .core_note_type = NT_ARM_POE, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = poe_get, + .set = poe_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 4a77f4976e11..561986947530 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -61,6 +61,7 @@ struct rt_sigframe_user_layout { unsigned long za_offset; unsigned long zt_offset; unsigned long fpmr_offset; + unsigned long poe_offset; unsigned long extra_offset; unsigned long end_offset; }; @@ -185,6 +186,8 @@ struct user_ctxs { u32 zt_size; struct fpmr_context __user *fpmr; u32 fpmr_size; + struct poe_context __user *poe; + u32 poe_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -258,6 +261,32 @@ static int restore_fpmr_context(struct user_ctxs *user) return err; } +static int preserve_poe_context(struct poe_context __user *ctx) +{ + int err = 0; + + __put_user_error(POE_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(read_sysreg_s(SYS_POR_EL0), &ctx->por_el0, err); + + return err; +} + +static int restore_poe_context(struct user_ctxs *user) +{ + u64 por_el0; + int err = 0; + + if (user->poe_size != sizeof(*user->poe)) + return -EINVAL; + + __get_user_error(por_el0, &(user->poe->por_el0), err); + if (!err) + write_sysreg_s(por_el0, SYS_POR_EL0); + + return err; +} + #ifdef CONFIG_ARM64_SVE static int preserve_sve_context(struct sve_context __user *ctx) @@ -621,6 +650,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->za = NULL; user->zt = NULL; user->fpmr = NULL; + user->poe = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -671,6 +701,17 @@ static int parse_user_sigframe(struct user_ctxs *user, /* ignore */ break; + case POE_MAGIC: + if (!system_supports_poe()) + goto invalid; + + if (user->poe) + goto invalid; + + user->poe = (struct poe_context __user *)head; + user->poe_size = size; + break; + case SVE_MAGIC: if (!system_supports_sve() && !system_supports_sme()) goto invalid; @@ -857,6 +898,9 @@ static int restore_sigframe(struct pt_regs *regs, if (err == 0 && system_supports_sme2() && user.zt) err = restore_zt_context(&user); + if (err == 0 && system_supports_poe() && user.poe) + err = restore_poe_context(&user); + return err; } @@ -980,6 +1024,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } + if (system_supports_poe()) { + err = sigframe_alloc(user, &user->poe_offset, + sizeof(struct poe_context)); + if (err) + return err; + } + return sigframe_alloc_end(user); } @@ -1042,6 +1093,14 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= preserve_fpmr_context(fpmr_ctx); } + if (system_supports_poe() && err == 0 && 
user->poe_offset) { + struct poe_context __user *poe_ctx = + apply_user_offset(user, user->poe_offset); + + err |= preserve_poe_context(poe_ctx); + } + + /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = @@ -1178,6 +1237,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, sme_smstop(); } + if (system_supports_poe()) + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index f01f0fd7b7fe..3b3f6b56e733 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -68,7 +68,7 @@ enum ipi_msg_type { IPI_RESCHEDULE, IPI_CALL_FUNC, IPI_CPU_STOP, - IPI_CPU_CRASH_STOP, + IPI_CPU_STOP_NMI, IPI_TIMER, IPI_IRQ_WORK, NR_IPI, @@ -85,6 +85,8 @@ static int ipi_irq_base __ro_after_init; static int nr_ipi __ro_after_init = NR_IPI; static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init; +static bool crash_stop; + static void ipi_setup(int cpu); #ifdef CONFIG_HOTPLUG_CPU @@ -823,7 +825,7 @@ static const char *ipi_types[MAX_IPI] __tracepoint_string = { [IPI_RESCHEDULE] = "Rescheduling interrupts", [IPI_CALL_FUNC] = "Function call interrupts", [IPI_CPU_STOP] = "CPU stop interrupts", - [IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts", + [IPI_CPU_STOP_NMI] = "CPU stop NMIs", [IPI_TIMER] = "Timer broadcast interrupts", [IPI_IRQ_WORK] = "IRQ work interrupts", [IPI_CPU_BACKTRACE] = "CPU backtrace interrupts", @@ -867,9 +869,9 @@ void arch_irq_work_raise(void) } #endif -static void __noreturn local_cpu_stop(void) +static void __noreturn local_cpu_stop(unsigned int cpu) { - set_cpu_online(smp_processor_id(), false); + set_cpu_online(cpu, false); local_daif_mask(); sdei_mask_local_cpu(); @@ -883,21 +885,26 @@ static void __noreturn local_cpu_stop(void) */ void __noreturn panic_smp_self_stop(void) { - local_cpu_stop(); + local_cpu_stop(smp_processor_id()); } -#ifdef CONFIG_KEXEC_CORE -static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); -#endif - static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) { #ifdef CONFIG_KEXEC_CORE + /* + * Use local_daif_mask() instead of local_irq_disable() to make sure + * that pseudo-NMIs are disabled. The "crash stop" code starts with + * an IRQ and falls back to NMI (which might be pseudo). If the IRQ + * finally goes through right as we're timing out then the NMI could + * interrupt us. It's better to prevent the NMI and let the IRQ + * finish since the pt_regs will be better. 
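Stepping back to the POR_EL0 handling added in process.c and signal.c above: from userspace the new state is driven through the existing memory protection key syscalls. A hedged sketch (not part of the series; it assumes the glibc pkey_alloc()/pkey_mprotect() wrappers and the generic PKEY_DISABLE_* flags) of the behaviour an application would observe, with the resulting fault reported as SEGV_PKUERR by the traps.c helper added further below:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int pkey;

	/* Key starts out with writes disabled in this thread's POR_EL0 */
	pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
	if (buf == MAP_FAILED || pkey < 0)
		return 1;

	/* Tag the mapping with the key; page-table permissions stay RW */
	pkey_mprotect(buf, page, PROT_READ | PROT_WRITE, pkey);

	printf("read ok: %d\n", buf[0]);	/* still permitted */
	buf[0] = 1;	/* expected SIGSEGV with si_code SEGV_PKUERR */
	return 0;
}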
+ */ + local_daif_mask(); + crash_save_cpu(regs, cpu); - atomic_dec(&waiting_for_crash_ipi); + set_cpu_online(cpu, false); - local_irq_disable(); sdei_mask_local_cpu(); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -962,14 +969,12 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_STOP: - local_cpu_stop(); - break; - - case IPI_CPU_CRASH_STOP: - if (IS_ENABLED(CONFIG_KEXEC_CORE)) { + case IPI_CPU_STOP_NMI: + if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop) { ipi_cpu_crash_stop(cpu, get_irq_regs()); - unreachable(); + } else { + local_cpu_stop(cpu); } break; @@ -1024,8 +1029,7 @@ static bool ipi_should_be_nmi(enum ipi_msg_type ipi) return false; switch (ipi) { - case IPI_CPU_STOP: - case IPI_CPU_CRASH_STOP: + case IPI_CPU_STOP_NMI: case IPI_CPU_BACKTRACE: case IPI_KGDB_ROUNDUP: return true; @@ -1138,79 +1142,109 @@ static inline unsigned int num_other_online_cpus(void) void smp_send_stop(void) { + static unsigned long stop_in_progress; + cpumask_t mask; unsigned long timeout; - if (num_other_online_cpus()) { - cpumask_t mask; + /* + * If this cpu is the only one alive at this point in time, online or + * not, there are no stop messages to be sent around, so just back out. + */ + if (num_other_online_cpus() == 0) + goto skip_ipi; - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); + /* Only proceed if this is the first CPU to reach this code */ + if (test_and_set_bit(0, &stop_in_progress)) + return; - if (system_state <= SYSTEM_RUNNING) - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_STOP); - } + /* + * Send an IPI to all currently online CPUs except the CPU running + * this code. + * + * NOTE: we don't do anything here to prevent other CPUs from coming + * online after we snapshot `cpu_online_mask`. Ideally, the calling code + * should do something to prevent other CPUs from coming up. This code + * can be called in the panic path and thus it doesn't seem wise to + * grab the CPU hotplug mutex ourselves. Worst case: + * - If a CPU comes online as we're running, we'll likely notice it + * during the 1 second wait below and then we'll catch it when we try + * with an NMI (assuming NMIs are enabled) since we re-snapshot the + * mask before sending an NMI. + * - If we leave the function and see that CPUs are still online we'll + * at least print a warning. Especially without NMIs this function + * isn't foolproof anyway so calling code will just have to accept + * the fact that there could be cases where a CPU can't be stopped. + */ + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); - /* Wait up to one second for other CPUs to stop */ + if (system_state <= SYSTEM_RUNNING) + pr_crit("SMP: stopping secondary CPUs\n"); + + /* + * Start with a normal IPI and wait up to one second for other CPUs to + * stop. We do this first because it gives other processors a chance + * to exit critical sections / drop locks and makes the rest of the + * stop process (especially console flush) more robust. + */ + smp_cross_call(&mask, IPI_CPU_STOP); timeout = USEC_PER_SEC; while (num_other_online_cpus() && timeout--) udelay(1); - if (num_other_online_cpus()) + /* + * If CPUs are still online, try an NMI. There's no excuse for this to + * be slow, so we only give them an extra 10 ms to respond. 
+ */ + if (num_other_online_cpus() && ipi_should_be_nmi(IPI_CPU_STOP_NMI)) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + pr_info("SMP: retry stop with NMI for CPUs %*pbl\n", + cpumask_pr_args(&mask)); + + smp_cross_call(&mask, IPI_CPU_STOP_NMI); + timeout = USEC_PER_MSEC * 10; + while (num_other_online_cpus() && timeout--) + udelay(1); + } + + if (num_other_online_cpus()) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(cpu_online_mask)); + cpumask_pr_args(&mask)); + } +skip_ipi: sdei_mask_local_cpu(); } #ifdef CONFIG_KEXEC_CORE void crash_smp_send_stop(void) { - static int cpus_stopped; - cpumask_t mask; - unsigned long timeout; - /* * This function can be called twice in panic path, but obviously * we execute this only once. + * + * We use this same boolean to tell whether the IPI we send was a + * stop or a "crash stop". */ - if (cpus_stopped) + if (crash_stop) return; + crash_stop = 1; - cpus_stopped = 1; + smp_send_stop(); - /* - * If this cpu is the only one alive at this point in time, online or - * not, there are no stop messages to be sent around, so just back out. - */ - if (num_other_online_cpus() == 0) - goto skip_ipi; - - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); - - atomic_set(&waiting_for_crash_ipi, num_other_online_cpus()); - - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_CRASH_STOP); - - /* Wait up to one second for other CPUs to stop */ - timeout = USEC_PER_SEC; - while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--) - udelay(1); - - if (atomic_read(&waiting_for_crash_ipi) > 0) - pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(&mask)); - -skip_ipi: - sdei_mask_local_cpu(); sdei_handler_abort(); } bool smp_crash_stop_failed(void) { - return (atomic_read(&waiting_for_crash_ipi) > 0); + return num_other_online_cpus() != 0; } #endif diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 9e22683aa921..563cbce11126 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -273,6 +273,12 @@ void arm64_force_sig_fault(int signo, int code, unsigned long far, force_sig_fault(signo, code, (void __user *)far); } +void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey) +{ + arm64_show_signal(SIGSEGV, str); + force_sig_pkuerr((void __user *)far, pkey); +} + void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { @@ -601,18 +607,26 @@ static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_read_counter()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_read_counter()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_get_rate()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + 
pt_regs_write_reg(regs, rt, arch_timer_get_rate()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void mrs_handler(unsigned long esr, struct pt_regs *regs) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8304eb342be9..ead632ad01b4 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -66,4 +66,21 @@ config PROTECTED_NVHE_STACKTRACE If unsure, or not using protected nVHE (pKVM), say N. +config PTDUMP_STAGE2_DEBUGFS + bool "Present the stage-2 pagetables to debugfs" + depends on KVM + depends on DEBUG_KERNEL + depends on DEBUG_FS + depends on GENERIC_PTDUMP + select PTDUMP_CORE + default n + help + Say Y here if you want to show the stage-2 kernel pagetables + layout in a debugfs file. This information is only useful for kernel developers + who are working in architecture specific areas of the kernel. + It is probably not a good idea to enable this feature in a production + kernel. + + If in doubt, say N. + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 86a629aaf0a1..3cf7adb2b503 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -17,7 +17,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ - arch_timer.o trng.o vmid.o emulate-nested.o nested.o \ + arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ @@ -27,6 +27,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o +kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o always-y := hyp_constants.h hyp-constants.s diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9bef7638342e..fe0764173cd0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -46,6 +46,8 @@ #include <kvm/arm_pmu.h> #include <kvm/arm_psci.h> +#include "sys_regs.h" + static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; enum kvm_wfx_trap_policy { @@ -228,6 +230,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) void kvm_arch_create_vm_debugfs(struct kvm *kvm) { kvm_sys_regs_create_debugfs(kvm); + kvm_s2_ptdump_create_debugfs(kvm); } static void kvm_destroy_mpidr_data(struct kvm *kvm) @@ -821,15 +824,13 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } - if (vcpu_has_nv(vcpu)) { - ret = kvm_init_nv_sysregs(vcpu->kvm); - if (ret) - return ret; - } + ret = kvm_finalize_sys_regs(vcpu); + if (ret) + return ret; /* - * This needs to happen after NV has imposed its own restrictions on - * the feature set + * This needs to happen after any restriction has been applied + * to the feature set. 
*/ kvm_calculate_traps(vcpu); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c new file mode 100644 index 000000000000..39f0e87a340e --- /dev/null +++ b/arch/arm64/kvm/at.c @@ -0,0 +1,1101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Linaro Ltd + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm_host.h> + +#include <asm/esr.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +enum trans_regime { + TR_EL10, + TR_EL20, + TR_EL2, +}; + +struct s1_walk_info { + u64 baddr; + enum trans_regime regime; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int txsz; + int sl; + bool hpd; + bool be; + bool s2; +}; + +struct s1_walk_result { + union { + struct { + u64 desc; + u64 pa; + s8 level; + u8 APTable; + bool UXNTable; + bool PXNTable; + }; + struct { + u8 fst; + bool ptw; + bool s2; + }; + }; + bool failed; +}; + +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) +{ + wr->fst = fst; + wr->ptw = ptw; + wr->s2 = s2; + wr->failed = true; +} + +#define S1_MMU_DISABLED (-127) + +static int get_ia_size(struct s1_walk_info *wi) +{ + return 64 - wi->txsz; +} + +/* Return true if the IPA is out of the OA range */ +static bool check_output_size(u64 ipa, struct s1_walk_info *wi) +{ + return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); +} + +/* Return the translation regime that applies to an AT instruction */ +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) +{ + /* + * We only get here from guest EL2, so the translation + * regime AT applies to is solely defined by {E2H,TGE}. + */ + switch (op) { + case OP_AT_S1E2R: + case OP_AT_S1E2W: + case OP_AT_S1E2A: + return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + break; + default: + return (vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; + } +} + +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; + unsigned int stride, x; + bool va55, tbi, lva, as_el0; + + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + + wi->regime = compute_translation_regime(vcpu, op); + as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + + va55 = va & BIT(55); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + + switch (wi->regime) { + case TR_EL10: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL2: + case TR_EL20: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + tbi = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_TBI, tcr) : + (va55 ? + FIELD_GET(TCR_TBI1, tcr) : + FIELD_GET(TCR_TBI0, tcr))); + + if (!tbi && (u64)sign_extend64(va, 55) != va) + goto addrsz; + + va = (u64)sign_extend64(va, 55); + + /* Let's put the MMU disabled case aside immediately */ + switch (wi->regime) { + case TR_EL10: + /* + * If dealing with the EL1&0 translation regime, 3 things + * can disable the S1 translation: + * + * - HCR_EL2.DC = 1 + * - HCR_EL2.{E2H,TGE} = {0,1} + * - SCTLR_EL1.M = 0 + * + * The TGE part is interesting. 
If we have decided that this + * is EL1&0, then it means that either {E2H,TGE} == {1,0} or + * {0,x}, and we only need to test for TGE == 1. + */ + if (hcr & (HCR_DC | HCR_TGE)) { + wr->level = S1_MMU_DISABLED; + break; + } + fallthrough; + case TR_EL2: + case TR_EL20: + if (!(sctlr & SCTLR_ELx_M)) + wr->level = S1_MMU_DISABLED; + break; + } + + if (wr->level == S1_MMU_DISABLED) { + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) + goto addrsz; + + wr->pa = va; + return 0; + } + + wi->be = sctlr & SCTLR_ELx_EE; + + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); + wi->hpd &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HPD, tcr) : + (va55 ? + FIELD_GET(TCR_HPD1, tcr) : + FIELD_GET(TCR_HPD0, tcr))); + + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + /* R_PLCGL, R_YXNYW */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { + if (wi->txsz > 39) + goto transfault_l0; + } else { + if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) + goto transfault_l0; + } + + /* R_GTJBY, R_SXWGM */ + switch (BIT(wi->pgshift)) { + case SZ_4K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_16K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_64K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); + break; + } + + if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) + goto transfault_l0; + + ia_bits = get_ia_size(wi); + + /* R_YYVYV, I_THCZK */ + if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || + (va55 && va < GENMASK(63, ia_bits))) + goto transfault_l0; + + /* I_ZFSYQ */ + if (wi->regime != TR_EL2 && + (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) + goto transfault_l0; + + /* R_BNDVG and following statements */ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && + as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + goto transfault_l0; + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + ps = (wi->regime == TR_EL2 ? 
+ FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); + + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); + + /* Compute minimal alignment */ + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); + + wi->baddr = ttbr & TTBRx_EL1_BADDR; + + /* R_VPBBF */ + if (check_output_size(wi->baddr, wi)) + goto addrsz; + + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + + return 0; + +addrsz: /* Address Size Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); + return -EFAULT; + +transfault_l0: /* Translation Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); + return -EFAULT; +} + +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 va_top, va_bottom, baddr, desc; + int level, stride, ret; + + level = wi->sl; + stride = wi->pgshift - 3; + baddr = wi->baddr; + + va_top = get_ia_size(wi) - 1; + + while (1) { + u64 index, ipa; + + va_bottom = (3 - level) * stride + wi->pgshift; + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); + + ipa = baddr | index; + + if (wi->s2) { + struct kvm_s2_trans s2_trans = {}; + + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); + if (ret) { + fail_s1_walk(wr, + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, + true, true); + return ret; + } + + if (!kvm_s2_trans_readable(&s2_trans)) { + fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), + true, true); + + return -EPERM; + } + + ipa = kvm_s2_trans_output(&s2_trans); + } + + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + if (ret) { + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), + true, false); + return ret; + } + + if (wi->be) + desc = be64_to_cpu((__force __be64)desc); + else + desc = le64_to_cpu((__force __le64)desc); + + /* Invalid descriptor */ + if (!(desc & BIT(0))) + goto transfault; + + /* Block mapping, check validity down the line */ + if (!(desc & BIT(1))) + break; + + /* Page mapping */ + if (level == 3) + break; + + /* Table handling */ + if (!wi->hpd) { + wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); + } + + baddr = desc & GENMASK_ULL(47, wi->pgshift); + + /* Check for out-of-range OA */ + if (check_output_size(baddr, wi)) + goto addrsz; + + /* Prepare for next round */ + va_top = va_bottom - 1; + level++; + } + + /* Block mapping, check the validity of the level */ + if (!(desc & BIT(1))) { + bool valid_block = false; + + switch (BIT(wi->pgshift)) { + case SZ_4K: + valid_block = level == 1 || level == 2; + break; + case SZ_16K: + case SZ_64K: + valid_block = level == 2; + break; + } + + if (!valid_block) + goto transfault; + } + + if (check_output_size(desc & GENMASK(47, va_bottom), wi)) + goto addrsz; + + va_bottom += contiguous_bit_shift(desc, wi, level); + + wr->failed = false; + wr->level = level; + wr->desc = desc; + wr->pa = desc & GENMASK(47, va_bottom); + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + + return 0; + +addrsz: + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); + return -EINVAL; +transfault: + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); + return -ENOENT; +} + +struct mmu_config { + u64 ttbr0; + u64 ttbr1; + u64 tcr; + u64 mair; + u64 sctlr; + u64 vttbr; + u64 vtcr; + u64 hcr; +}; + +static void __mmu_config_save(struct mmu_config *config) +{ + config->ttbr0 = read_sysreg_el1(SYS_TTBR0); + config->ttbr1 = read_sysreg_el1(SYS_TTBR1); + config->tcr = read_sysreg_el1(SYS_TCR); + config->mair = 
read_sysreg_el1(SYS_MAIR); + config->sctlr = read_sysreg_el1(SYS_SCTLR); + config->vttbr = read_sysreg(vttbr_el2); + config->vtcr = read_sysreg(vtcr_el2); + config->hcr = read_sysreg(hcr_el2); +} + +static void __mmu_config_restore(struct mmu_config *config) +{ + write_sysreg(config->hcr, hcr_el2); + + /* + * ARM errata 1165522 and 1530923 require TGE to be 1 before + * we update the guest state. + */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + + write_sysreg_el1(config->ttbr0, SYS_TTBR0); + write_sysreg_el1(config->ttbr1, SYS_TTBR1); + write_sysreg_el1(config->tcr, SYS_TCR); + write_sysreg_el1(config->mair, SYS_MAIR); + write_sysreg_el1(config->sctlr, SYS_SCTLR); + write_sysreg(config->vttbr, vttbr_el2); + write_sysreg(config->vtcr, vtcr_el2); +} + +static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 host_pan; + bool fail; + + host_pan = read_sysreg_s(SYS_PSTATE_PAN); + write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); + + switch (op) { + case OP_AT_S1E1RP: + fail = __kvm_at(OP_AT_S1E1RP, vaddr); + break; + case OP_AT_S1E1WP: + fail = __kvm_at(OP_AT_S1E1WP, vaddr); + break; + } + + write_sysreg_s(host_pan, SYS_PSTATE_PAN); + + return fail; +} + +#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) +#define MEMATTR_NC 0b0100 +#define MEMATTR_Wt 0b1000 +#define MEMATTR_Wb 0b1100 +#define MEMATTR_WbRaWa 0b1111 + +#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) + +static u8 s2_memattr_to_attr(u8 memattr) +{ + memattr &= 0b1111; + + switch (memattr) { + case 0b0000: + case 0b0001: + case 0b0010: + case 0b0011: + return memattr << 2; + case 0b0100: + return MEMATTR(Wb, Wb); + case 0b0101: + return MEMATTR(NC, NC); + case 0b0110: + return MEMATTR(Wt, NC); + case 0b0111: + return MEMATTR(Wb, NC); + case 0b1000: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1001: + return MEMATTR(NC, Wt); + case 0b1010: + return MEMATTR(Wt, Wt); + case 0b1011: + return MEMATTR(Wb, Wt); + case 0b1100: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1101: + return MEMATTR(NC, Wb); + case 0b1110: + return MEMATTR(Wt, Wb); + case 0b1111: + return MEMATTR(Wb, Wb); + default: + unreachable(); + } +} + +static u8 combine_s1_s2_attr(u8 s1, u8 s2) +{ + bool transient; + u8 final = 0; + + /* Upgrade transient s1 to non-transient to simplify things */ + switch (s1) { + case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ + transient = true; + s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); + break; + case 0b0101 ... 
0b0111: /* Normal, Write-Back Transient */ + transient = true; + s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); + break; + default: + transient = false; + } + + /* S2CombineS1AttrHints() */ + if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || + (s2 & GENMASK(3, 2)) == MEMATTR_NC) + final = MEMATTR_NC; + else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || + (s2 & GENMASK(3, 2)) == MEMATTR_Wt) + final = MEMATTR_Wt; + else + final = MEMATTR_Wb; + + if (final != MEMATTR_NC) { + /* Inherit RaWa hints form S1 */ + if (transient) { + switch (s1 & GENMASK(3, 2)) { + case MEMATTR_Wt: + final = 0; + break; + case MEMATTR_Wb: + final = MEMATTR_NC; + break; + } + } + + final |= s1 & GENMASK(1, 0); + } + + return final; +} + +#define ATTR_NSH 0b00 +#define ATTR_RSV 0b01 +#define ATTR_OSH 0b10 +#define ATTR_ISH 0b11 + +static u8 compute_sh(u8 attr, u64 desc) +{ + u8 sh; + + /* Any form of device, as well as NC has SH[1:0]=0b10 */ + if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) + return ATTR_OSH; + + sh = FIELD_GET(PTE_SHARED, desc); + if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ + sh = ATTR_NSH; + + return sh; +} + +static u8 combine_sh(u8 s1_sh, u8 s2_sh) +{ + if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) + return ATTR_OSH; + if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) + return ATTR_ISH; + + return ATTR_NSH; +} + +static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, + struct kvm_s2_trans *tr) +{ + u8 s1_parattr, s2_memattr, final_attr; + u64 par; + + /* If S2 has failed to translate, report the damage */ + if (tr->esr) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= SYS_PAR_EL1_S; + par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); + return par; + } + + s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); + s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { + if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) + s2_memattr &= ~BIT(3); + + /* Combination of R_VRJSW and R_RHWZM */ + switch (s2_memattr) { + case 0b0101: + if (MEMATTR_IS_DEVICE(s1_parattr)) + final_attr = s1_parattr; + else + final_attr = MEMATTR(NC, NC); + break; + case 0b0110: + case 0b1110: + final_attr = MEMATTR(WbRaWa, WbRaWa); + break; + case 0b0111: + case 0b1111: + /* Preserve S1 attribute */ + final_attr = s1_parattr; + break; + case 0b0100: + case 0b1100: + case 0b1101: + /* Reserved, do something non-silly */ + final_attr = s1_parattr; + break; + default: + /* MemAttr[2]=0, Device from S2 */ + final_attr = s2_memattr & GENMASK(1,0) << 2; + } + } else { + /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ + u8 s2_parattr = s2_memattr_to_attr(s2_memattr); + + if (MEMATTR_IS_DEVICE(s1_parattr) || + MEMATTR_IS_DEVICE(s2_parattr)) { + final_attr = min(s1_parattr, s2_parattr); + } else { + /* At this stage, this is memory vs memory */ + final_attr = combine_s1_s2_attr(s1_parattr & 0xf, + s2_parattr & 0xf); + final_attr |= combine_s1_s2_attr(s1_parattr >> 4, + s2_parattr >> 4) << 4; + } + } + + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && + !MEMATTR_IS_DEVICE(final_attr)) + final_attr = MEMATTR(NC, NC); + + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); + par |= tr->output & GENMASK(47, 12); + par |= FIELD_PREP(SYS_PAR_EL1_SH, + combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), + compute_sh(final_attr, tr->desc))); + + return par; +} + +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, + enum trans_regime regime) +{ + u64 par; + + if (wr->failed) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); + par 
|= wr->ptw ? SYS_PAR_EL1_PTW : 0; + par |= wr->s2 ? SYS_PAR_EL1_S : 0; + } else if (wr->level == S1_MMU_DISABLED) { + /* MMU off or HCR_EL2.DC == 1 */ + par = SYS_PAR_EL1_NSE; + par |= wr->pa & GENMASK_ULL(47, 12); + + if (regime == TR_EL10 && + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, + MEMATTR(WbRaWa, WbRaWa)); + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); + } else { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); + } + } else { + u64 mair, sctlr; + u8 sh; + + par = SYS_PAR_EL1_NSE; + + mair = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, MAIR_EL1) : + vcpu_read_sys_reg(vcpu, MAIR_EL2)); + + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; + mair &= 0xff; + + sctlr = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, SCTLR_EL1) : + vcpu_read_sys_reg(vcpu, SCTLR_EL2)); + + /* Force NC for memory if SCTLR_ELx.C is clear */ + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) + mair = MEMATTR(NC, NC); + + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); + par |= wr->pa & GENMASK_ULL(47, 12); + + sh = compute_sh(mair, wr->desc); + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); + } + + return par; +} + +static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + u64 sctlr; + + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + return false; + + if (regime == TR_EL10) + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + else + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + + return sctlr & SCTLR_EL1_EPAN; +} + +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + bool perm_fail, ur, uw, ux, pr, pw, px; + struct s1_walk_result wr = {}; + struct s1_walk_info wi = {}; + int ret, idx; + + ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + if (ret) + goto compute_par; + + if (wr.level == S1_MMU_DISABLED) + goto compute_par; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = walk_s1(vcpu, &wi, &wr, vaddr); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (ret) + goto compute_par; + + /* FIXME: revisit when adding indirect permission support */ + /* AArch64.S1DirectBasePermissions() */ + if (wi.regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) { + case 0b00: + pr = pw = true; + ur = uw = false; + break; + case 0b01: + pr = pw = ur = uw = true; + break; + case 0b10: + pr = true; + pw = ur = uw = false; + break; + case 0b11: + pr = ur = true; + pw = uw = false; + break; + } + + switch (wr.APTable) { + case 0b00: + break; + case 0b01: + ur = uw = false; + break; + case 0b10: + pw = uw = false; + break; + case 0b11: + pw = ur = uw = false; + break; + } + + /* We don't use px for anything yet, but hey... 
*/ + px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw); + ux = !((wr.desc & PTE_UXN) || wr.UXNTable); + + if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) { + bool pan; + + pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; + pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux); + pw &= !pan; + pr &= !pan; + } + } else { + ur = uw = ux = false; + + if (!(wr.desc & PTE_RDONLY)) { + pr = pw = true; + } else { + pr = true; + pw = false; + } + + if (wr.APTable & BIT(1)) + pw = false; + + /* XN maps to UXN */ + px = !((wr.desc & PTE_UXN) || wr.UXNTable); + } + + perm_fail = false; + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1R: + case OP_AT_S1E2R: + perm_fail = !pr; + break; + case OP_AT_S1E1WP: + case OP_AT_S1E1W: + case OP_AT_S1E2W: + perm_fail = !pw; + break; + case OP_AT_S1E0R: + perm_fail = !ur; + break; + case OP_AT_S1E0W: + perm_fail = !uw; + break; + case OP_AT_S1E1A: + case OP_AT_S1E2A: + break; + default: + BUG(); + } + + if (perm_fail) + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); + +compute_par: + return compute_par_s1(vcpu, &wr, wi.regime); +} + +/* + * Return the PAR_EL1 value as the result of a valid translation. + * + * If the translation is unsuccessful, the value may only contain + * PAR_EL1.F, and cannot be taken at face value. It isn't an + * indication of the translation having failed, only that the fast + * path did not succeed, *unless* it indicates a S1 permission fault. + */ +static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct mmu_config config; + struct kvm_s2_mmu *mmu; + bool fail; + u64 par; + + par = SYS_PAR_EL1_F; + + /* + * We've trapped, so everything is live on the CPU. As we will + * be switching contexts behind everybody's back, disable + * interrupts while holding the mmu lock. + */ + guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); + + /* + * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already + * the right one (as we trapped from vEL2). If not, save the + * full MMU context. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) + goto skip_mmu_switch; + + /* + * Obtaining the S2 MMU for a L2 is horribly racy, and we may not + * find it (recycled by another vcpu, for example). When this + * happens, admit defeat immediately and use the SW (slow) path. 
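Back in setup_s1_walk() earlier in this file, the AArch64.S1StartLevel() arithmetic is fairly dense. The following standalone sketch (illustration only, mirroring the expressions used above) shows how the translation-table start level falls out of txsz and the granule shift:

#include <stdio.h>

static int s1_start_level(unsigned int txsz, unsigned int pgshift)
{
	unsigned int ia_bits = 64 - txsz;	/* get_ia_size() */
	unsigned int stride = pgshift - 3;	/* IA bits resolved per level */

	return 3 - (((ia_bits - 1) - pgshift) / stride);
}

int main(void)
{
	printf("4K,  txsz=16 -> level %d\n", s1_start_level(16, 12));	/* 0: 48-bit IA */
	printf("4K,  txsz=25 -> level %d\n", s1_start_level(25, 12));	/* 1: 39-bit IA */
	printf("64K, txsz=22 -> level %d\n", s1_start_level(22, 16));	/* 2: 42-bit IA */
	return 0;
}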
+ */ + mmu = lookup_s2_mmu(vcpu); + if (!mmu) + return par; + + __mmu_config_save(&config); + + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); + __load_stage2(mmu, mmu->arch); + +skip_mmu_switch: + /* Clear TGE, enable S2 translation, we're rolling */ + write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); + isb(); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1WP: + fail = at_s1e1p_fast(vcpu, op, vaddr); + break; + case OP_AT_S1E1R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E1W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E0R: + fail = __kvm_at(OP_AT_S1E0R, vaddr); + break; + case OP_AT_S1E0W: + fail = __kvm_at(OP_AT_S1E0W, vaddr); + break; + case OP_AT_S1E1A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + break; + } + + if (!fail) + par = read_sysreg_par(); + + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + __mmu_config_restore(&config); + + return par; +} + +static bool par_check_s1_perm_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && + !(par & SYS_PAR_EL1_S)); +} + +void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + + /* + * If PAR_EL1 reports that AT failed on a S1 permission fault, we + * know for sure that the PTW was able to walk the S1 tables and + * there's nothing else to do. + * + * If AT failed for any other reason, then we must walk the guest S1 + * to emulate the instruction. + */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par; + + /* + * We've trapped, so everything is live on the CPU. As we will be + * switching context behind everybody's back, disable interrupts... 
+ */ + scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { + struct kvm_s2_mmu *mmu; + u64 val, hcr; + bool fail; + + mmu = &vcpu->kvm->arch.mmu; + + val = hcr = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + val |= HCR_VM; + + if (!vcpu_el2_e2h_is_set(vcpu)) + val |= HCR_NV | HCR_NV1; + + write_sysreg(val, hcr_el2); + isb(); + + par = SYS_PAR_EL1_F; + + switch (op) { + case OP_AT_S1E2R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E2W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E2A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + } + + isb(); + + if (!fail) + par = read_sysreg_par(); + + write_sysreg(hcr, hcr_el2); + isb(); + } + + /* We failed the translation, let's replay it in slow motion */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct kvm_s2_trans out = {}; + u64 ipa, par; + bool write; + int ret; + + /* Do the stage-1 translation */ + switch (op) { + case OP_AT_S12E1R: + op = OP_AT_S1E1R; + write = false; + break; + case OP_AT_S12E1W: + op = OP_AT_S1E1W; + write = true; + break; + case OP_AT_S12E0R: + op = OP_AT_S1E0R; + write = false; + break; + case OP_AT_S12E0W: + op = OP_AT_S1E0W; + write = true; + break; + default: + WARN_ON_ONCE(1); + return; + } + + __kvm_at_s1e01(vcpu, op, vaddr); + par = vcpu_read_sys_reg(vcpu, PAR_EL1); + if (par & SYS_PAR_EL1_F) + return; + + /* + * If we only have a single stage of translation (E2H=0 or + * TGE=1), exit early. Same thing if {VM,DC}=={0,0}. + */ + if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) || + !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) + return; + + /* Do the stage-2 translation */ + ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); + out.esr = 0; + ret = kvm_walk_nested_s2(vcpu, ipa, &out); + if (ret < 0) + return; + + /* Check the access permission */ + if (!out.esr && + ((!write && !out.readable) || (write && !out.writable))) + out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); + + par = compute_par_s12(vcpu, par, &out); + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 05166eccea0a..05b6435d02a9 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -83,14 +83,20 @@ enum cgt_group_id { CGT_CPTR_TAM, CGT_CPTR_TCPAC, + CGT_HCRX_EnFPM, CGT_HCRX_TCR2En, + CGT_ICH_HCR_TC, + CGT_ICH_HCR_TALL0, + CGT_ICH_HCR_TALL1, + CGT_ICH_HCR_TDIR, + /* * Anything after this point is a combination of coarse trap * controls, which must all be evaluated to decide what to do. */ __MULTIPLE_CONTROL_BITS__, - CGT_HCR_IMO_FMO = __MULTIPLE_CONTROL_BITS__, + CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__, CGT_HCR_TID2_TID4, CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB_TTLBOS, @@ -105,6 +111,8 @@ enum cgt_group_id { CGT_MDCR_TDE_TDRA, CGT_MDCR_TDCC_TDE_TDA, + CGT_ICH_HCR_TC_TDIR, + /* * Anything after this point requires a callback evaluating a * complex trap condition. Ugly stuff. 
@@ -372,12 +380,42 @@ static const struct trap_bits coarse_trap_bits[] = { .mask = CPTR_EL2_TCPAC, .behaviour = BEHAVE_FORWARD_ANY, }, + [CGT_HCRX_EnFPM] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_EnFPM, + .behaviour = BEHAVE_FORWARD_ANY, + }, [CGT_HCRX_TCR2En] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_TCR2En, .behaviour = BEHAVE_FORWARD_ANY, }, + [CGT_ICH_HCR_TC] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TC, + .mask = ICH_HCR_TC, + .behaviour = BEHAVE_FORWARD_ANY, + }, + [CGT_ICH_HCR_TALL0] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TALL0, + .mask = ICH_HCR_TALL0, + .behaviour = BEHAVE_FORWARD_ANY, + }, + [CGT_ICH_HCR_TALL1] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TALL1, + .mask = ICH_HCR_TALL1, + .behaviour = BEHAVE_FORWARD_ANY, + }, + [CGT_ICH_HCR_TDIR] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TDIR, + .mask = ICH_HCR_TDIR, + .behaviour = BEHAVE_FORWARD_ANY, + }, }; #define MCB(id, ...) \ @@ -387,7 +425,6 @@ static const struct trap_bits coarse_trap_bits[] = { } static const enum cgt_group_id *coarse_control_combo[] = { - MCB(CGT_HCR_IMO_FMO, CGT_HCR_IMO, CGT_HCR_FMO), MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4), MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS), MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS), @@ -402,6 +439,9 @@ static const enum cgt_group_id *coarse_control_combo[] = { MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA), + + MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC), + MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR), }; typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); @@ -536,9 +576,9 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), SR_RANGE_TRAP(SYS_ID_PFR0_EL1, sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), - SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO), + SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0), sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0), @@ -786,6 +826,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV), + SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV), @@ -867,6 +908,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT), SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN), @@ -1108,6 +1150,35 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), + /* + * IMPDEF choice: + * We treat ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as + * RAO/WI. 
We therefore never consider ICC_SRE_EL2.Enable for + * ICC_SRE_EL1 access, and always handle it locally. + */ + SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR), + SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC), }; static DEFINE_XARRAY(sr_forward_xa); diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index c53e5b14038d..ea5484ce1f3b 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -63,6 +63,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) */ *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; *host_data_ptr(fpsimd_state) = kern_hyp_va(¤t->thread.uw.fpsimd_state); + *host_data_ptr(fpmr_ptr) = kern_hyp_va(¤t->thread.uw.fpmr); vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) @@ -134,8 +135,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fp_state.sve_state = vcpu->arch.sve_state; fp_state.sve_vl = vcpu->arch.sve_max_vl; fp_state.sme_state = NULL; - fp_state.svcr = &vcpu->arch.svcr; - fp_state.fpmr = &vcpu->arch.fpmr; + fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR); + fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR); fp_state.fp_type = &vcpu->arch.fp_type; if (vcpu_has_sve(vcpu)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 11098eb7eb44..962f985977c2 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1045,6 +1045,11 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, mutex_lock(&kvm->slots_lock); + if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) { + ret = -EBUSY; + goto out; + } + while (length > 0) { kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); void *maddr; @@ -1059,6 +1064,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, page = pfn_to_online_page(pfn); if (!page) { /* Reject ZONE_DEVICE memory */ + kvm_release_pfn_clean(pfn); ret = -EFAULT; goto out; } diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 9e13c1bc2ad5..17df94570f03 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -14,6 +14,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { + int ret; u64 par, tmp; /* @@ -27,7 +28,9 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) * saved the guest context yet, and we may return early... */ par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) + ret = system_supports_poe() ? 
__kvm_at(OP_AT_S1E1A, far) : + __kvm_at(OP_AT_S1E1R, far); + if (!ret) tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 37ff87d782b6..46d52e8a3df3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -403,6 +403,9 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) else __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs); + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR); + /* Skip restoring fpexc32 for AArch64 guests */ if (!(read_sysreg(hcr_el2) & HCR_RW)) write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2); diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 4c0fdabaf8ae..1579a3c08a36 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -16,9 +16,15 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); + static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0); } static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) @@ -66,6 +72,17 @@ static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, TCRX, IMP); } +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!system_supports_poe()) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1POE, IMP); +} + static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); @@ -80,6 +97,9 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); } + + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR); } ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR); ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0); @@ -120,6 +140,10 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. 
+ if (ctxt_has_s1poe(ctxt)) + write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0); } static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) @@ -158,6 +182,9 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); } + + if (ctxt_has_s1poe(ctxt)) + write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR); } write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR); write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0); diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index e715c157c2c4..e433dfab882a 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -426,9 +426,9 @@ out: return; } -static __always_inline void do_ffa_mem_xfer(const u64 func_id, - struct arm_smccc_res *res, - struct kvm_cpu_context *ctxt) +static void __do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, len, ctxt, 1); DECLARE_REG(u32, fraglen, ctxt, 2); @@ -440,9 +440,6 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, u32 offset, nr_ranges; int ret = 0; - BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && - func_id != FFA_FN64_MEM_LEND); - if (addr_mbz || npages_mbz || fraglen > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { ret = FFA_RET_INVALID_PARAMETERS; @@ -461,6 +458,11 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, goto out_unlock; } + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + buf = hyp_buffers.tx; memcpy(buf, host_buffers.tx, fraglen); @@ -512,6 +514,13 @@ err_unshare: goto out_unlock; } +#define do_ffa_mem_xfer(fid, res, ctxt) \ + do { \ + BUILD_BUG_ON((fid) != FFA_FN64_MEM_SHARE && \ + (fid) != FFA_FN64_MEM_LEND); \ + __do_ffa_mem_xfer((fid), (res), (ctxt)); \ + } while (0); + static void do_ffa_mem_reclaim(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 07120b37da35..401af1835be6 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -130,7 +130,7 @@ alternative_else_nop_endif /* Invalidate the stale TLBs from Bootloader */ tlbi alle2 - tlbi vmalls12e1 + tlbi alle1 dsb sy mov_q x0, INIT_SCTLR_EL2_MMU_ON diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f43d845f3c4e..87692b566d90 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -62,6 +62,8 @@ static void fpsimd_sve_flush(void) static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) { + bool has_fpmr; + if (!guest_owns_fp_regs()) return; @@ -73,11 +75,18 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) else __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); + if (has_fpmr) + __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR); + if (system_supports_sve()) __hyp_sve_restore_host(); else __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); + if (has_fpmr) + write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; } diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 8f5c56d5b1cd..cc69106734ca 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -197,6 +197,15 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) } else { 
__fpsimd_save_state(*host_data_ptr(fpsimd_state)); } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) { + u64 val = read_sysreg_s(SYS_FPMR); + + if (unlikely(is_protected_kvm_enabled())) + *host_data_ptr(fpmr) = val; + else + **host_data_ptr(fpmr_ptr) = val; + } } static const exit_handler_fn hyp_exit_handlers[] = { diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index ca3c09df8d7c..48da9ca9763f 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -132,10 +132,10 @@ static void exit_vmid_context(struct tlb_inv_context *cxt) else __load_host_stage2(); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { - /* Ensure write of the old VMID */ - isb(); + /* Ensure write of the old VMID */ + isb(); + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { if (!(cxt->sctlr & SCTLR_ELx_M)) { write_sysreg_el1(cxt->sctlr, SYS_SCTLR); isb(); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 9e2bbee77491..b11bcebac908 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -17,48 +17,6 @@ #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 -#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) - -#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ - ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ - ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) - -#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) - -#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) - -#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ - KVM_PTE_LEAF_ATTR_HI_S2_XN) - -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID 1 - -/* - * Used to indicate a pte for which a 'break-before-make' sequence is in - * progress. - */ -#define KVM_INVALID_PTE_LOCKED BIT(10) - struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; @@ -1547,7 +1505,6 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, */ new = kvm_init_table_pte(childp, mm_ops); stage2_make_pte(ctx, new); - dsb(ishst); return 0; } @@ -1559,8 +1516,11 @@ int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, .flags = KVM_PGTABLE_WALK_LEAF, .arg = mc, }; + int ret; - return kvm_pgtable_walk(pgt, addr, size, &walker); + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; } int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 7b397fad26f2..18d4677002b1 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -268,8 +268,16 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) * starting to mess with the rest of the GIC, and VMCR_EL2 in * particular. 
This logic must be called before * __vgic_v3_restore_state(). + * + * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is + * provisioned at all. In order to prevent illegal accesses to the + * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 + * so that the trap bits can take effect. Yes, we *loves* the GIC. */ - if (!cpu_if->vgic_sre) { + if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) { + write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); + isb(); + } else if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); @@ -288,8 +296,9 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. + * Prevent the guest from touching the ICC_SRE_EL1 system + * register. Note that this may not have any effect, as + * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. */ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); @@ -297,10 +306,11 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) /* * If we need to trap system registers, we must write * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected, + * injected. Note that this also applies if we don't expect + * any system register access (no vgic at all). */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } @@ -326,7 +336,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) * no interrupts were being injected, and we disable it again here. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(0, ICH_HCR_EL2); } @@ -1032,6 +1042,75 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) write_gicreg(vmcr, ICH_VMCR_EL2); } +static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, + u32 sysreg, bool is_read) +{ + u64 ich_hcr; + + if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + return false; + + ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + switch (sysreg) { + case SYS_ICC_IGRPEN0_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP0Rn_EL1(0): + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_BPR0_EL1: + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_IAR0_EL1: + return ich_hcr & ICH_HCR_TALL0; + + case SYS_ICC_IGRPEN1_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP1Rn_EL1(0): + case SYS_ICC_AP1Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(3): + case SYS_ICC_BPR1_EL1: + case SYS_ICC_EOIR1_EL1: + case SYS_ICC_HPPIR1_EL1: + case SYS_ICC_IAR1_EL1: + return ich_hcr & ICH_HCR_TALL1; + + case SYS_ICC_DIR_EL1: + if (ich_hcr & ICH_HCR_TDIR) + return true; + + fallthrough; + + case SYS_ICC_RPR_EL1: + case SYS_ICC_CTLR_EL1: + case SYS_ICC_PMR_EL1: + return ich_hcr & ICH_HCR_TC; + + default: + return false; + } +} + int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) { int rt; @@ -1041,6 +1120,9 @@ int 
__vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) bool is_read; u32 sysreg; + if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return 0; + esr = kvm_vcpu_get_esr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { if (!kvm_condition_valid(vcpu)) { @@ -1055,6 +1137,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read)) + return 0; + switch (sysreg) { case SYS_ICC_IAR0_EL1: case SYS_ICC_IAR1_EL1: diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 77010b76c150..80581b1c3995 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -312,6 +312,9 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) { __fpsimd_save_state(*host_data_ptr(fpsimd_state)); + + if (kvm_has_fpmr(vcpu->kvm)) + **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR); } static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index bab27f9d8cc6..638b6205526f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -103,20 +103,6 @@ struct s2_walk_info { bool be; }; -static unsigned int ps_to_output_size(unsigned int ps) -{ - switch (ps) { - case 0: return 32; - case 1: return 36; - case 2: return 40; - case 3: return 42; - case 4: return 44; - case 5: - default: - return 48; - } -} - static u32 compute_fsc(int level, u32 fsc) { return fsc | (level & 0x3); @@ -256,7 +242,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, /* Check for valid descriptor at this point */ if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); - out->upper_attr = desc; + out->desc = desc; return 1; } @@ -266,7 +252,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, if (check_output_size(wi, desc)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); - out->upper_attr = desc; + out->desc = desc; return 1; } @@ -278,27 +264,24 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, if (level < first_block_level) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); - out->upper_attr = desc; + out->desc = desc; return 1; } - /* - * We don't use the contiguous bit in the stage-2 ptes, so skip check - * for misprogramming of the contiguous bit. 
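The removed comment above reflected the old behaviour of ignoring the contiguous hint; the hunk goes on to fold it into the address split via the new addr_bottom += contiguous_bit_shift(desc, wi, level) line. A standalone sketch of that composition, assuming a level-3 4KiB leaf where 16 contiguous entries contribute 4 extra low bits (contiguous_bit_shift() itself is not shown in this hunk):

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	uint64_t desc = 0x89abc000ULL | (1ULL << 10) | 3;	/* valid page, AF set */
	uint64_t ipa  = 0x12345678ULL;
	unsigned int addr_bottom = 12;				/* 4KiB leaf at level 3 */

	/* Assumption: 16 contiguous 4KiB entries -> 4 more bits come from the IPA. */
	addr_bottom += 4;

	uint64_t paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
			 (ipa  & GENMASK_ULL(addr_bottom - 1, 0));

	printf("paddr = 0x%016llx\n", (unsigned long long)paddr);	/* 0x0000000089ab5678 */
	return 0;
}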
- */ - if (check_output_size(wi, desc)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); - out->upper_attr = desc; + out->desc = desc; return 1; } if (!(desc & BIT(10))) { out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); - out->upper_attr = desc; + out->desc = desc; return 1; } + addr_bottom += contiguous_bit_shift(desc, wi, level); + /* Calculate and return the result */ paddr = (desc & GENMASK_ULL(47, addr_bottom)) | (ipa & GENMASK_ULL(addr_bottom - 1, 0)); @@ -307,7 +290,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, out->readable = desc & (0b01 << 6); out->writable = desc & (0b10 << 6); out->level = level; - out->upper_attr = desc & GENMASK_ULL(63, 52); + out->desc = desc; return 0; } @@ -954,19 +937,16 @@ static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) int kvm_init_nv_sysregs(struct kvm *kvm) { u64 res0, res1; - int ret = 0; - mutex_lock(&kvm->arch.config_lock); + lockdep_assert_held(&kvm->arch.config_lock); if (kvm->arch.sysreg_masks) - goto out; + return 0; kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), GFP_KERNEL_ACCOUNT); - if (!kvm->arch.sysreg_masks) { - ret = -ENOMEM; - goto out; - } + if (!kvm->arch.sysreg_masks) + return -ENOMEM; limit_nv_id_regs(kvm); @@ -1195,8 +1175,13 @@ int kvm_init_nv_sysregs(struct kvm *kvm) if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1)) res0 |= ~(res0 | res1); set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); -out: - mutex_unlock(&kvm->arch.config_lock); - return ret; + /* SCTLR_EL1 */ + res0 = SCTLR_EL1_RES0; + res1 = SCTLR_EL1_RES1; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + res0 |= SCTLR_EL1_EPAN; + set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + + return 0; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 82a2a003259c..ac36c438b8c1 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -233,7 +233,7 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) pmu->pmc[i].idx = i; } @@ -260,7 +260,7 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); irq_work_sync(&vcpu->arch.pmu.overflow_work); } @@ -291,7 +291,7 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) || !val) return; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { struct kvm_pmc *pmc; if (!(val & BIT(i))) @@ -323,7 +323,7 @@ void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) if (!kvm_vcpu_has_pmu(vcpu) || !val) return; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { struct kvm_pmc *pmc; if (!(val & BIT(i))) @@ -910,10 +910,10 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; /* - * The arm_pmu->num_events considers the cycle counter as well. - * Ignore that and return only the general-purpose counters. + * The arm_pmu->cntr_mask considers the fixed counter(s) as well. + * Ignore those and return only the general-purpose counters. 
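The replacement return just below weighs cntr_mask over the general-counter range only, which leaves the cycle counter (bit 31) and, where implemented, the instruction counter (bit 32) out of the count. A self-contained illustration of that bit-counting, with an assumed six-counter PMU:

#include <stdint.h>
#include <stdio.h>

#define MAX_GENERAL_COUNTERS	31	/* assumed value of ARMV8_PMU_MAX_GENERAL_COUNTERS */

/* userspace stand-in for bitmap_weight(mask, nbits): count set bits below nbits */
static unsigned int weight(uint64_t mask, unsigned int nbits)
{
	unsigned int i, n = 0;

	for (i = 0; i < nbits; i++)
		n += (mask >> i) & 1;
	return n;
}

int main(void)
{
	/* six general-purpose counters plus the cycle (bit 31) and instruction (bit 32) counters */
	uint64_t cntr_mask = 0x3fULL | (1ULL << 31) | (1ULL << 32);

	printf("%u\n", weight(cntr_mask, MAX_GENERAL_COUNTERS));	/* prints 6 */
	return 0;
}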
*/ - return arm_pmu->num_events - 1; + return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); } static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 329819806096..0b3adf3e17b4 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -5,6 +5,8 @@ */ #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <linux/perf/arm_pmu.h> +#include <linux/perf/arm_pmuv3.h> static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); @@ -35,7 +37,7 @@ struct kvm_pmu_events *kvm_get_pmu_events(void) * Add events to track that we may want to switch at guest entry/exit * time. */ -void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); @@ -51,7 +53,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) /* * Stop tracking events */ -void kvm_clr_pmu_events(u32 clr) +void kvm_clr_pmu_events(u64 clr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); @@ -62,79 +64,32 @@ void kvm_clr_pmu_events(u32 clr) pmu->events_guest &= ~clr; } -#define PMEVTYPER_READ_CASE(idx) \ - case idx: \ - return read_sysreg(pmevtyper##idx##_el0) - -#define PMEVTYPER_WRITE_CASE(idx) \ - case idx: \ - write_sysreg(val, pmevtyper##idx##_el0); \ - break - -#define PMEVTYPER_CASES(readwrite) \ - PMEVTYPER_##readwrite##_CASE(0); \ - PMEVTYPER_##readwrite##_CASE(1); \ - PMEVTYPER_##readwrite##_CASE(2); \ - PMEVTYPER_##readwrite##_CASE(3); \ - PMEVTYPER_##readwrite##_CASE(4); \ - PMEVTYPER_##readwrite##_CASE(5); \ - PMEVTYPER_##readwrite##_CASE(6); \ - PMEVTYPER_##readwrite##_CASE(7); \ - PMEVTYPER_##readwrite##_CASE(8); \ - PMEVTYPER_##readwrite##_CASE(9); \ - PMEVTYPER_##readwrite##_CASE(10); \ - PMEVTYPER_##readwrite##_CASE(11); \ - PMEVTYPER_##readwrite##_CASE(12); \ - PMEVTYPER_##readwrite##_CASE(13); \ - PMEVTYPER_##readwrite##_CASE(14); \ - PMEVTYPER_##readwrite##_CASE(15); \ - PMEVTYPER_##readwrite##_CASE(16); \ - PMEVTYPER_##readwrite##_CASE(17); \ - PMEVTYPER_##readwrite##_CASE(18); \ - PMEVTYPER_##readwrite##_CASE(19); \ - PMEVTYPER_##readwrite##_CASE(20); \ - PMEVTYPER_##readwrite##_CASE(21); \ - PMEVTYPER_##readwrite##_CASE(22); \ - PMEVTYPER_##readwrite##_CASE(23); \ - PMEVTYPER_##readwrite##_CASE(24); \ - PMEVTYPER_##readwrite##_CASE(25); \ - PMEVTYPER_##readwrite##_CASE(26); \ - PMEVTYPER_##readwrite##_CASE(27); \ - PMEVTYPER_##readwrite##_CASE(28); \ - PMEVTYPER_##readwrite##_CASE(29); \ - PMEVTYPER_##readwrite##_CASE(30) - /* * Read a value direct from PMEVTYPER<idx> where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. */ static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) { - switch (idx) { - PMEVTYPER_CASES(READ); - case ARMV8_PMU_CYCLE_IDX: - return read_sysreg(pmccfiltr_el0); - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + return read_pmccfiltr(); + else if (idx == ARMV8_PMU_INSTR_IDX) + return read_pmicfiltr(); - return 0; + return read_pmevtypern(idx); } /* * Write a value direct to PMEVTYPER<idx> where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. 
*/ static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val) { - switch (idx) { - PMEVTYPER_CASES(WRITE); - case ARMV8_PMU_CYCLE_IDX: - write_sysreg(val, pmccfiltr_el0); - break; - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + write_pmccfiltr(val); + else if (idx == ARMV8_PMU_INSTR_IDX) + write_pmicfiltr(val); + else + write_pmevtypern(idx, val); } /* @@ -145,7 +100,7 @@ static void kvm_vcpu_pmu_enable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer &= ~ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); @@ -160,7 +115,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer |= ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); @@ -176,7 +131,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; @@ -197,7 +152,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c new file mode 100644 index 000000000000..e4a342e903e2 --- /dev/null +++ b/arch/arm64/kvm/ptdump.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Debug helper used to dump the stage-2 pagetables of the system and their + * associated permissions. 
+ * + * Copyright (C) Google, 2024 + * Author: Sebastian Ene <sebastianene@google.com> + */ +#include <linux/debugfs.h> +#include <linux/kvm_host.h> +#include <linux/seq_file.h> + +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/ptdump.h> + +#define MARKERS_LEN 2 +#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1) + +struct kvm_ptdump_guest_state { + struct kvm *kvm; + struct ptdump_pg_state parser_state; + struct addr_marker ipa_marker[MARKERS_LEN]; + struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS]; + struct ptdump_range range[MARKERS_LEN]; +}; + +static const struct ptdump_prot_bits stage2_pte_bits[] = { + { + .mask = PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "F", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, + .set = "R", + .clear = " ", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, + .set = "W", + .clear = " ", + }, { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "X", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, + .set = "AF", + .clear = " ", + }, { + .mask = PTE_TABLE_BIT | PTE_VALID, + .val = PTE_VALID, + .set = "BLK", + .clear = " ", + }, +}; + +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct ptdump_pg_state *st = ctx->arg; + struct ptdump_state *pt_st = &st->ptdump; + + note_page(pt_st, ctx->addr, ctx->level, ctx->old); + + return 0; +} + +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl) +{ + u32 i; + u64 mask; + + if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + mask = 0; + for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++) + mask |= stage2_pte_bits[i].mask; + + for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) { + snprintf(level[i].name, sizeof(level[i].name), "%u", i); + + level[i].num = ARRAY_SIZE(stage2_pte_bits); + level[i].bits = stage2_pte_bits; + level[i].mask = mask; + } + + return 0; +} + +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm) +{ + struct kvm_ptdump_guest_state *st; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct kvm_pgtable *pgtable = mmu->pgt; + int ret; + + st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT); + if (!st) + return ERR_PTR(-ENOMEM); + + ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level); + if (ret) { + kfree(st); + return ERR_PTR(ret); + } + + st->ipa_marker[0].name = "Guest IPA"; + st->ipa_marker[1].start_address = BIT(pgtable->ia_bits); + st->range[0].end = BIT(pgtable->ia_bits); + + st->kvm = kvm; + st->parser_state = (struct ptdump_pg_state) { + .marker = &st->ipa_marker[0], + .level = -1, + .pg_level = &st->level[0], + .ptdump.range = &st->range[0], + .start_address = 0, + }; + + return st; +} + +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused) +{ + int ret; + struct kvm_ptdump_guest_state *st = m->private; + struct kvm *kvm = st->kvm; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct ptdump_pg_state *parser_state = &st->parser_state; + struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) { + .cb = kvm_ptdump_visitor, + .arg = parser_state, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + parser_state->seq = m; + + write_lock(&kvm->mmu_lock); + ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker); + 
write_unlock(&kvm->mmu_lock); + + return ret; +} + +static int kvm_ptdump_guest_open(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + struct kvm_ptdump_guest_state *st; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + st = kvm_ptdump_parser_create(kvm); + if (IS_ERR(st)) { + ret = PTR_ERR(st); + goto err_with_kvm_ref; + } + + ret = single_open(file, kvm_ptdump_guest_show, st); + if (!ret) + return 0; + + kfree(st); +err_with_kvm_ref: + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_ptdump_guest_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + void *st = ((struct seq_file *)file->private_data)->private; + + kfree(st); + kvm_put_kvm(kvm); + + return single_release(m, file); +} + +static const struct file_operations kvm_ptdump_guest_fops = { + .open = kvm_ptdump_guest_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_ptdump_guest_close, +}; + +static int kvm_pgtable_range_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%2u\n", pgtable->ia_bits); + return 0; +} + +static int kvm_pgtable_levels_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level); + return 0; +} + +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + struct kvm *kvm = m->i_private; + struct kvm_pgtable *pgtable; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + pgtable = kvm->arch.mmu.pgt; + + ret = single_open(file, show, pgtable); + if (ret < 0) + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_pgtable_range_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show); +} + +static int kvm_pgtable_levels_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show); +} + +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + + kvm_put_kvm(kvm); + return single_release(m, file); +} + +static const struct file_operations kvm_pgtable_range_fops = { + .open = kvm_pgtable_range_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +static const struct file_operations kvm_pgtable_levels_fops = { + .open = kvm_pgtable_levels_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) +{ + debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry, + kvm, &kvm_ptdump_guest_fops); + debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm, + &kvm_pgtable_range_fops); + debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry, + kvm, &kvm_pgtable_levels_fops); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 31e49da867ff..dad88e31f953 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -18,6 +18,7 @@ #include <linux/printk.h> #include <linux/uaccess.h> +#include <asm/arm_pmuv3.h> #include <asm/cacheflush.h> #include <asm/cputype.h> #include <asm/debug-monitors.h> @@ -47,6 +48,13 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); +static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + 
kvm_inject_undefined(vcpu); + return false; +} + static bool bad_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r, @@ -54,8 +62,7 @@ static bool bad_trap(struct kvm_vcpu *vcpu, { WARN_ONCE(1, "Unexpected %s\n", msg); print_sys_reg_instr(params); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, params, r); } static bool read_from_write_only(struct kvm_vcpu *vcpu, @@ -346,10 +353,8 @@ static bool access_dcgsw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (!kvm_has_mte(vcpu->kvm)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_mte(vcpu->kvm)) + return undef_access(vcpu, p, r); /* Treat MTE S/W ops as we treat the classic ones: with contempt */ return access_dcsw(vcpu, p, r); @@ -386,10 +391,8 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, u64 val, mask, shift; if (reg_to_encoding(r) == SYS_TCR2_EL1 && - !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) { - kvm_inject_undefined(vcpu); - return false; - } + !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + return undef_access(vcpu, p, r); BUG_ON(!p->is_write); @@ -436,10 +439,8 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, { bool g1; - if (!kvm_has_gicv3(vcpu->kvm)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); if (!p->is_write) return read_from_write_only(vcpu, p, r); @@ -484,6 +485,9 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (p->is_write) return ignore_write(vcpu, p); @@ -501,14 +505,6 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu, return read_zero(vcpu, p); } -static bool trap_undef(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - return false; -} - /* * ARMv8.1 mandates at least a trivial LORegion implementation, where all the * RW registers are RES0 (which we can implement as RAZ/WI). 
On an ARMv8.0 @@ -521,10 +517,8 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, { u32 sr = reg_to_encoding(r); - if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) + return undef_access(vcpu, p, r); if (p->is_write && sr == SYS_LORID_EL1) return write_to_read_only(vcpu, p, r); @@ -893,7 +887,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; + __vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK; return __vcpu_sys_reg(vcpu, r->reg); } @@ -985,7 +979,7 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, else /* return PMSELR.SEL field */ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + & PMSELR_EL0_SEL_MASK; return true; } @@ -1053,8 +1047,8 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, if (pmu_access_event_counter_el0_disabled(vcpu)) return false; - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); } else if (r->Op2 == 0) { /* PMCCNTR_EL0 */ if (pmu_access_cycle_counter_el0_disabled(vcpu)) @@ -1104,7 +1098,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { /* PMXEVTYPER_EL0 */ - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); reg = PMEVTYPER0_EL0 + idx; } else if (r->CRn == 14 && (r->CRm & 12) == 12) { idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); @@ -1257,10 +1251,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { - if (!vcpu_mode_priv(vcpu)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!vcpu_mode_priv(vcpu)) + return undef_access(vcpu, p, r); __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval & ARMV8_PMU_USERENR_MASK; @@ -1344,14 +1336,6 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } -static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - - return false; -} - /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access } #define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access } @@ -1410,8 +1394,7 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, break; default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, p, r); } if (p->is_write) @@ -1545,6 +1528,10 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); break; + case SYS_ID_AA64PFR2_EL1: + /* We only expose FPMR */ + val &= ID_AA64PFR2_EL1_FPMR; + break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | @@ -1562,6 +1549,9 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; break; + case SYS_ID_AA64MMFR3_EL1: + val &= 
ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE; + break; case SYS_ID_MMFR4_EL1: val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); break; @@ -1675,6 +1665,24 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static unsigned int sme_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_fpmr(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -2091,26 +2099,6 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) /* - * EL{0,1}2 registers are the EL2 view on an EL0 or EL1 register when - * HCR_EL2.E2H==1, and only in the sysreg table for convenience of - * handling traps. Given that, they are always hidden from userspace. - */ -static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - return REG_HIDDEN_USER; -} - -#define EL12_REG(name, acc, rst, v) { \ - SYS_DESC(SYS_##name##_EL12), \ - .access = acc, \ - .reset = rst, \ - .reg = name##_EL1, \ - .val = v, \ - .visibility = hidden_user_visibility, \ -} - -/* * Since reset() callback and field val are not used for idregs, they will be * used for specific purposes for idregs. * The reset() would return KVM sanitised register value. The value would be the @@ -2217,6 +2205,18 @@ static bool access_spsr(struct kvm_vcpu *vcpu, return true; } +static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval; + else + p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); + + return true; +} + static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 val = r->val; @@ -2261,6 +2261,15 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, return true; } +static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + return 0; + + return REG_HIDDEN; +} + /* * Architected system registers. 
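The sys_reg_descs[] table that follows must stay sorted by encoding so KVM can binary-search it when a trap arrives. A standalone sketch of that field-by-field ordering; the POR_EL0/POR_EL1 encodings used here are assumptions for illustration only:

#include <stdio.h>

struct enc {
	unsigned char op0, op1, crn, crm, op2;
};

/* compare most-significant field first, mirroring the (Op0, Op1, CRn, CRm, Op2) sort key */
static int cmp_enc(const struct enc *a, const struct enc *b)
{
	if (a->op0 != b->op0) return a->op0 - b->op0;
	if (a->op1 != b->op1) return a->op1 - b->op1;
	if (a->crn != b->crn) return a->crn - b->crn;
	if (a->crm != b->crm) return a->crm - b->crm;
	return a->op2 - b->op2;
}

int main(void)
{
	struct enc por_el1 = { 3, 0, 10, 2, 4 };	/* assumed: S3_0_C10_C2_4 */
	struct enc por_el0 = { 3, 3, 10, 2, 4 };	/* assumed: S3_3_C10_C2_4 */

	printf("%s\n", cmp_enc(&por_el1, &por_el0) < 0 ?
	       "POR_EL1 precedes POR_EL0" : "POR_EL0 precedes POR_EL1");
	return 0;
}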
* Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2307,7 +2316,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { // DBGDTR[TR]X_EL0 share the same encoding { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, - { SYS_DESC(SYS_DBGVCR32_EL2), trap_undef, reset_val, DBGVCR32_EL2, 0 }, + { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, @@ -2365,16 +2374,15 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR0_EL1_MPAM | ID_AA64PFR0_EL1_SVE | ID_AA64PFR0_EL1_RAS | - ID_AA64PFR0_EL1_GIC | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP), }, ID_SANITISED(ID_AA64PFR1_EL1), - ID_UNALLOCATED(4,2), + ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ID_HIDDEN(ID_AA64SMFR0_EL1), ID_UNALLOCATED(4,6), - ID_UNALLOCATED(4,7), + ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), /* CRm=5 */ { SYS_DESC(SYS_ID_AA64DFR0_EL1), @@ -2424,7 +2432,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64MMFR2_EL1_IDS | ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), - ID_SANITISED(ID_AA64MMFR3_EL1), + ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_S1POE)), ID_SANITISED(ID_AA64MMFR4_EL1), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), @@ -2455,6 +2464,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SPSR_EL1), access_spsr}, { SYS_DESC(SYS_ELR_EL1), access_elr}, + { SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, @@ -2498,6 +2509,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 }, { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 }, + { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, @@ -2509,18 +2522,31 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, - { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, - { 
SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, @@ -2541,7 +2567,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { CTR_EL0_IDC_MASK | CTR_EL0_DminLine_MASK | CTR_EL0_IminLine_MASK), - { SYS_DESC(SYS_SVCR), undef_access }, + { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, + { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, @@ -2584,6 +2611,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pmovs, .reg = PMOVSSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, + { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, @@ -2764,7 +2793,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), - { SYS_DESC(SYS_DACR32_EL2), trap_undef, reset_unknown, DACR32_EL2 }, + { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0), EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0), @@ -2773,20 +2802,16 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SP_EL1), access_sp_el1}, /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ - { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - - { SYS_DESC(SYS_IFSR32_EL2), trap_undef, reset_unknown, IFSR32_EL2 }, + { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi }, + + { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 }, EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), EL2_REG_REDIR(ESR_EL2, reset_val, 0), - { SYS_DESC(SYS_FPEXC32_EL2), trap_undef, reset_val, FPEXC32_EL2, 0x700 }, + { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 }, EL2_REG_REDIR(FAR_EL2, reset_val, 0), EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), @@ -2796,7 +2821,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), - { SYS_DESC(SYS_RMR_EL2), trap_undef }, + { SYS_DESC(SYS_RMR_EL2), undef_access 
}, + + EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0), EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), @@ -2804,11 +2831,48 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), - EL12_REG(CNTKCTL, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; +static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s1e01(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + /* There is no FGT associated with AT S1E2A :-( */ + if (op == OP_AT_S1E2A && + !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + __kvm_at_s1e2(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s12(vcpu, op, p->regval); + + return true; +} + static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; @@ -2830,10 +2894,8 @@ static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); write_lock(&vcpu->kvm->mmu_lock); @@ -2902,10 +2964,8 @@ static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 limit, vttbr; - if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm)); @@ -2930,10 +2990,8 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, u64 base, range, tg, num, scale; int shift; - if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); /* * Because the shadow S2 structure doesn't necessarily reflect that @@ -3001,10 +3059,8 @@ static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); - if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { @@ -3044,10 +3100,8 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, WARN_ON(!vcpu_is_el2(vcpu)); - if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) + return undef_access(vcpu, 
p, r); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { @@ -3071,6 +3125,14 @@ static struct sys_reg_desc sys_insn_descs[] = { { SYS_DESC(SYS_DC_ISW), access_dcsw }, { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, + + SYS_INSN(AT_S1E1R, handle_at_s1e01), + SYS_INSN(AT_S1E1W, handle_at_s1e01), + SYS_INSN(AT_S1E0R, handle_at_s1e01), + SYS_INSN(AT_S1E0W, handle_at_s1e01), + SYS_INSN(AT_S1E1RP, handle_at_s1e01), + SYS_INSN(AT_S1E1WP, handle_at_s1e01), + { SYS_DESC(SYS_DC_CSW), access_dcsw }, { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, @@ -3150,19 +3212,27 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1), + SYS_INSN(AT_S1E2R, handle_at_s1e2), + SYS_INSN(AT_S1E2W, handle_at_s1e2), + SYS_INSN(AT_S12E1R, handle_at_s12), + SYS_INSN(AT_S12E1W, handle_at_s12), + SYS_INSN(AT_S12E0R, handle_at_s12), + SYS_INSN(AT_S12E0W, handle_at_s12), + SYS_INSN(AT_S1E2A, handle_at_s1e2), + SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OS, trap_undef), - SYS_INSN(TLBI_VAE2OS, trap_undef), + SYS_INSN(TLBI_ALLE2OS, undef_access), + SYS_INSN(TLBI_VAE2OS, undef_access), SYS_INSN(TLBI_ALLE1OS, handle_alle1is), - SYS_INSN(TLBI_VALE2OS, trap_undef), + SYS_INSN(TLBI_VALE2OS, undef_access), SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2IS, trap_undef), - SYS_INSN(TLBI_RVALE2IS, trap_undef), + SYS_INSN(TLBI_RVAE2IS, undef_access), + SYS_INSN(TLBI_RVALE2IS, undef_access), SYS_INSN(TLBI_ALLE1IS, handle_alle1is), SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), @@ -3174,10 +3244,10 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), - SYS_INSN(TLBI_RVAE2OS, trap_undef), - SYS_INSN(TLBI_RVALE2OS, trap_undef), - SYS_INSN(TLBI_RVAE2, trap_undef), - SYS_INSN(TLBI_RVALE2, trap_undef), + SYS_INSN(TLBI_RVAE2OS, undef_access), + SYS_INSN(TLBI_RVALE2OS, undef_access), + SYS_INSN(TLBI_RVAE2, undef_access), + SYS_INSN(TLBI_RVALE2, undef_access), SYS_INSN(TLBI_ALLE1, handle_alle1is), SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), @@ -3186,19 +3256,19 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OSNXS, trap_undef), - SYS_INSN(TLBI_VAE2OSNXS, trap_undef), + SYS_INSN(TLBI_ALLE2OSNXS, undef_access), + SYS_INSN(TLBI_VAE2OSNXS, undef_access), SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2OSNXS, trap_undef), + SYS_INSN(TLBI_VALE2OSNXS, undef_access), SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2ISNXS, trap_undef), - SYS_INSN(TLBI_RVALE2ISNXS, trap_undef), - SYS_INSN(TLBI_ALLE2ISNXS, trap_undef), - SYS_INSN(TLBI_VAE2ISNXS, trap_undef), + SYS_INSN(TLBI_RVAE2ISNXS, undef_access), + SYS_INSN(TLBI_RVALE2ISNXS, undef_access), + SYS_INSN(TLBI_ALLE2ISNXS, undef_access), + SYS_INSN(TLBI_VAE2ISNXS, undef_access), SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2ISNXS, trap_undef), + SYS_INSN(TLBI_VALE2ISNXS, undef_access), SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), @@ -3208,14 
+3278,14 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), - SYS_INSN(TLBI_RVAE2OSNXS, trap_undef), - SYS_INSN(TLBI_RVALE2OSNXS, trap_undef), - SYS_INSN(TLBI_RVAE2NXS, trap_undef), - SYS_INSN(TLBI_RVALE2NXS, trap_undef), - SYS_INSN(TLBI_ALLE2NXS, trap_undef), - SYS_INSN(TLBI_VAE2NXS, trap_undef), + SYS_INSN(TLBI_RVAE2OSNXS, undef_access), + SYS_INSN(TLBI_RVALE2OSNXS, undef_access), + SYS_INSN(TLBI_RVAE2NXS, undef_access), + SYS_INSN(TLBI_RVALE2NXS, undef_access), + SYS_INSN(TLBI_ALLE2NXS, undef_access), + SYS_INSN(TLBI_VAE2NXS, undef_access), SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), - SYS_INSN(TLBI_VALE2NXS, trap_undef), + SYS_INSN(TLBI_VALE2NXS, undef_access), SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), }; @@ -3393,6 +3463,7 @@ static const struct sys_reg_desc cp15_regs[] = { /* TTBCR2 */ { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, + { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, /* DFSR */ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, @@ -3442,8 +3513,28 @@ static const struct sys_reg_desc cp15_regs[] = { /* AMAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, - /* ICC_SRE */ - { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, @@ -4280,7 +4371,7 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int ret; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r || sysreg_hidden_user(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) { @@ -4324,7 +4415,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, return -EFAULT; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r || sysreg_hidden_user(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (sysreg_user_write_ignore(vcpu, r)) @@ -4410,7 +4501,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, if (!(rd->reg || rd->get_user)) return 0; - if (sysreg_hidden_user(vcpu, rd)) + if 
(sysreg_hidden(vcpu, rd)) return 0; if (!copy_reg_to_user(rd, uind)) @@ -4551,6 +4642,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) mutex_lock(&kvm->arch.config_lock); vcpu_set_hcr(vcpu); + vcpu_set_ich_hcr(vcpu); if (cpus_have_final_cap(ARM64_HAS_HCX)) { /* @@ -4566,6 +4658,9 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) if (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; + + if (kvm_has_fpmr(kvm)) + vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM; } if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) @@ -4574,8 +4669,6 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 | HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nS2POR_EL1 | - HFGxTR_EL2_nPOR_EL1 | - HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nACCDATA_EL1 | HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK); @@ -4606,10 +4699,21 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS); + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) + kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A; + + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2)) + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP | + HFGITR_EL2_ATS1E1WP); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 | + HFGxTR_EL2_nPOR_EL0); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 | HAFGRTR_EL2_RES1); @@ -4619,6 +4723,36 @@ out: mutex_unlock(&kvm->arch.config_lock); } +/* + * Perform last adjustments to the ID registers that are implied by the + * configuration outside of the ID regs themselves, as well as any + * initialisation that directly depend on these ID registers (such as + * RES0/RES1 behaviours). This is not the place to configure traps though. + * + * Because this can be called once per CPU, changes must be idempotent. 
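The function below takes the config lock with guard(mutex)(&kvm->arch.config_lock), so every early return drops the lock automatically. A rough userspace analogue of that scope-based pattern; the names and the pthread mapping are illustrative, not the kernel implementation:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;

static void unlock_on_exit(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

/* rough analogue of guard(mutex)(&lock): lock now, unlock when the scope is left */
#define scoped_lock(m) \
	pthread_mutex_t *guard_ __attribute__((cleanup(unlock_on_exit))) = \
		(pthread_mutex_lock(m), (m))

static int finalize(int already_done)
{
	scoped_lock(&cfg_lock);

	if (already_done)
		return 0;	/* unlocked automatically */

	/* ... one-time setup would go here ... */
	return 0;		/* unlocked automatically */
}

int main(void)
{
	printf("%d %d\n", finalize(0), finalize(1));
	return 0;
}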
+ */ +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + guard(mutex)(&kvm->arch.config_lock); + + if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && + irqchip_in_kernel(kvm) && + kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) { + kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK; + kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK; + } + + if (vcpu_has_nv(vcpu)) { + int ret = kvm_init_nv_sysregs(kvm); + if (ret) + return ret; + } + + return 0; +} + int __init kvm_sys_reg_table_init(void) { bool valid = true; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 997eea21ba2a..1d94ed6efad2 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -95,9 +95,8 @@ struct sys_reg_desc { }; #define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ -#define REG_HIDDEN_USER (1 << 1) /* hidden from userspace only */ -#define REG_RAZ (1 << 2) /* RAZ from userspace and guest */ -#define REG_USER_WI (1 << 3) /* WI from userspace only */ +#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ +#define REG_USER_WI (1 << 2) /* WI from userspace only */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -165,15 +164,6 @@ static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, return sysreg_visibility(vcpu, r) & REG_HIDDEN; } -static inline bool sysreg_hidden_user(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *r) -{ - if (likely(!r->visibility)) - return false; - - return r->visibility(vcpu, r) & (REG_HIDDEN | REG_HIDDEN_USER); -} - static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { @@ -235,6 +225,8 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); + #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x @@ -248,4 +240,11 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) +#define CP15_SYS_DESC(reg) \ + .name = #reg, \ + .aarch32_map = AA32_DIRECT, \ + Op0(0), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 3eecdd2f4b8f..b217b256853c 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -292,6 +292,18 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) /* Get the show on the road... 
*/ vgic_v3->vgic_hcr = ICH_HCR_EN; +} + +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* Hide GICv3 sysreg if necessary */ + if (!kvm_has_gicv3(vcpu->kvm)) { + vgic_v3->vgic_hcr |= ICH_HCR_TALL0 | ICH_HCR_TALL1 | ICH_HCR_TC; + return; + } + if (group0_trap) vgic_v3->vgic_hcr |= ICH_HCR_TALL0; if (group1_trap) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index abe29c7d85d0..f50274fd5581 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -922,10 +922,13 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) void kvm_vgic_load(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_load(vcpu); else vgic_v3_load(vcpu); @@ -933,10 +936,13 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) void kvm_vgic_put(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_put(vcpu); else vgic_v3_put(vcpu); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 8532bfe3fed4..f2486b4d9f95 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -346,11 +346,11 @@ void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu); + static inline bool kvm_has_gicv3(struct kvm *kvm) { - return (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && - irqchip_in_kernel(kvm) && - kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3); + return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); } #endif diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 60454256945b..2fc8c6dd0407 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ - ioremap.o mmap.o pgd.o mmu.o \ + ioremap.o mmap.o pgd.o mem_encrypt.o mmu.o \ context.o proc.o pageattr.o fixmap.o obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index a3edced29ac1..55107d27d3f8 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -421,6 +421,12 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, ptep = contpte_align_down(ptep); start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + /* + * We are not advancing entry because __ptep_set_access_flags() + * only consumes access flags from entry. And since we have checked + * for the whole contpte block and returned early, pte_same() + * within __ptep_set_access_flags() is likely false. 
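+	 * Either way, the access flags carried by entry are still applied
+	 * to every PTE of the block in the loop below.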
+ */ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) __ptep_set_access_flags(vma, addr, ptep, entry, 0); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 451ba7cbd5ad..8b281cf308b3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -23,6 +23,7 @@ #include <linux/sched/debug.h> #include <linux/highmem.h> #include <linux/perf_event.h> +#include <linux/pkeys.h> #include <linux/preempt.h> #include <linux/hugetlb.h> @@ -486,6 +487,23 @@ static void do_bad_area(unsigned long far, unsigned long esr, } } +static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma, + unsigned int mm_flags) +{ + unsigned long iss2 = ESR_ELx_ISS2(esr); + + if (!system_supports_poe()) + return false; + + if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay)) + return true; + + return !arch_vma_access_permitted(vma, + mm_flags & FAULT_FLAG_WRITE, + mm_flags & FAULT_FLAG_INSTRUCTION, + false); +} + static bool is_el0_instruction_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; @@ -511,6 +529,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, unsigned long addr = untagged_addr(far); struct vm_area_struct *vma; int si_code; + int pkey = -1; if (kprobe_page_fault(regs, esr)) return 0; @@ -575,6 +594,16 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto bad_area; } + + if (fault_from_pkey(esr, vma, mm_flags)) { + pkey = vma_pkey(vma); + vma_end_read(vma); + fault = 0; + si_code = SEGV_PKUERR; + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto bad_area; + } + fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); @@ -610,7 +639,16 @@ retry: goto bad_area; } + if (fault_from_pkey(esr, vma, mm_flags)) { + pkey = vma_pkey(vma); + mmap_read_unlock(mm); + fault = 0; + si_code = SEGV_PKUERR; + goto bad_area; + } + fault = handle_mm_fault(vma, addr, mm_flags, regs); + /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) @@ -669,8 +707,23 @@ bad_area: arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name); } else { + /* + * The pkey value that we return to userspace can be different + * from the pkey that caused the fault. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set POR_EL0 to deny access to pkey=4, touches, page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_lock, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. 
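+	 * In that case userspace is sent pkey=5 in the SIGSEGV siginfo even
+	 * though the access was actually denied under pkey=4; the same
+	 * caveat already exists on other pkey-capable architectures.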
+ */ /* Something tried to access memory that out of memory map */ - arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name); + if (si_code == SEGV_PKUERR) + arm64_force_sig_fault_pkey(far, inf->name, pkey); + else + arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name); } return 0; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9b5ab6818f7f..a0400b9aa814 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -414,8 +414,16 @@ void __init mem_init(void) void free_initmem(void) { - free_reserved_area(lm_alias(__init_begin), - lm_alias(__init_end), + void *lm_init_begin = lm_alias(__init_begin); + void *lm_init_end = lm_alias(__init_end); + + WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE)); + WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE)); + + /* Delete __init region from memblock.reserved. */ + memblock_free(lm_init_begin, lm_init_end - lm_init_begin); + + free_reserved_area(lm_init_begin, lm_init_end, POISON_FREE_INITMEM, "unused kernel"); /* * Unmap the __init region but leave the VM area in place. This diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 269f2f63ab7d..6cc0b7e7eb03 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -3,10 +3,22 @@ #include <linux/mm.h> #include <linux/io.h> +static ioremap_prot_hook_t ioremap_prot_hook; + +int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) +{ + if (WARN_ON(ioremap_prot_hook)) + return -EBUSY; + + ioremap_prot_hook = hook; + return 0; +} + void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot) { unsigned long last_addr = phys_addr + size - 1; + pgprot_t pgprot = __pgprot(prot); /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) @@ -16,7 +28,16 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr)))) return NULL; - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + /* + * If a hook is registered (e.g. for confidential computing + * purposes), call that now and barf if it fails. + */ + if (unlikely(ioremap_prot_hook) && + WARN_ON(ioremap_prot_hook(phys_addr, size, &pgprot))) { + return NULL; + } + + return generic_ioremap_prot(phys_addr, size, pgprot); } EXPORT_SYMBOL(ioremap_prot); diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c new file mode 100644 index 000000000000..ee3c0ab04384 --- /dev/null +++ b/arch/arm64/mm/mem_encrypt.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Implementation of the memory encryption/decryption API. + * + * Since the low-level details of the operation depend on the + * Confidential Computing environment (e.g. pKVM, CCA, ...), this just + * acts as a top-level dispatcher to whatever hooks may have been + * registered. + * + * Author: Will Deacon <will@kernel.org> + * Copyright (C) 2024 Google LLC + * + * "Hello, boils and ghouls!" 
+ */ + +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/err.h> +#include <linux/mm.h> + +#include <asm/mem_encrypt.h> + +static const struct arm64_mem_crypt_ops *crypt_ops; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops) +{ + if (WARN_ON(crypt_ops)) + return -EBUSY; + + crypt_ops = ops; + return 0; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->encrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->decrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 642bdf908b22..7e3ad97e27d8 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -102,6 +102,17 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) if (vm_flags & VM_MTE) prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED); +#ifdef CONFIG_ARCH_HAS_PKEYS + if (system_supports_poe()) { + if (vm_flags & VM_PKEY_BIT0) + prot |= PTE_PO_IDX_0; + if (vm_flags & VM_PKEY_BIT1) + prot |= PTE_PO_IDX_1; + if (vm_flags & VM_PKEY_BIT2) + prot |= PTE_PO_IDX_2; + } +#endif + return __pgprot(prot); } EXPORT_SYMBOL(vm_get_page_prot); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 353ea5dc32b8..e55b02fbddc8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -25,6 +25,7 @@ #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/kfence.h> +#include <linux/pkeys.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -1549,3 +1550,47 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) cpu_uninstall_idmap(); } + +#ifdef CONFIG_ARCH_HAS_PKEYS +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) +{ + u64 new_por = POE_RXW; + u64 old_por; + u64 pkey_shift; + + if (!system_supports_poe()) + return -ENOSPC; + + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. 
+ */ + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) + return -EINVAL; + + /* Set the bits we need in POR: */ + new_por = POE_RXW; + if (init_val & PKEY_DISABLE_WRITE) + new_por &= ~POE_W; + if (init_val & PKEY_DISABLE_ACCESS) + new_por &= ~POE_RW; + if (init_val & PKEY_DISABLE_READ) + new_por &= ~POE_R; + if (init_val & PKEY_DISABLE_EXECUTE) + new_por &= ~POE_X; + + /* Shift the bits in to the correct place in POR for pkey: */ + pkey_shift = pkey * POR_BITS_PER_PKEY; + new_por <<= pkey_shift; + + /* Get old POR and mask off any old bits in place: */ + old_por = read_sysreg_s(SYS_POR_EL0); + old_por &= ~(POE_MASK << pkey_shift); + + /* Write old part along with new part: */ + write_sysreg_s(old_por | new_por, SYS_POR_EL0); + + return 0; +} +#endif diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index f4bc6c5bac06..8abdc7fed321 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -36,8 +36,6 @@ #define TCR_KASLR_FLAGS 0 #endif -#define TCR_SMP_FLAGS TCR_SHARED - /* PTWs cacheable, inner/outer WBWA */ #define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA @@ -469,7 +467,7 @@ SYM_FUNC_START(__cpu_setup) tcr .req x16 mov_q mair, MAIR_EL1_SET mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \ - TCR_SMP_FLAGS | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ + TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS tcr_clear_errata_bits tcr, x9, x5 diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 6986827e0d64..264c5f9b97d8 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -38,33 +38,7 @@ seq_printf(m, fmt); \ }) -/* - * The page dumper groups page table entries of the same type into a single - * description. It uses pg_state to track the range information while - * iterating over the pte entries. When the continuity is broken it then - * dumps out a description of the range. 
- */ -struct pg_state { - struct ptdump_state ptdump; - struct seq_file *seq; - const struct addr_marker *marker; - const struct mm_struct *mm; - unsigned long start_address; - int level; - u64 current_prot; - bool check_wx; - unsigned long wx_pages; - unsigned long uxn_pages; -}; - -struct prot_bits { - u64 mask; - u64 val; - const char *set; - const char *clear; -}; - -static const struct prot_bits pte_bits[] = { +static const struct ptdump_prot_bits pte_bits[] = { { .mask = PTE_VALID, .val = PTE_VALID, @@ -143,14 +117,7 @@ static const struct prot_bits pte_bits[] = { } }; -struct pg_level { - const struct prot_bits *bits; - char name[4]; - int num; - u64 mask; -}; - -static struct pg_level pg_level[] __ro_after_init = { +static struct ptdump_pg_level kernel_pg_levels[] __ro_after_init = { { /* pgd */ .name = "PGD", .bits = pte_bits, @@ -174,7 +141,7 @@ static struct pg_level pg_level[] __ro_after_init = { }, }; -static void dump_prot(struct pg_state *st, const struct prot_bits *bits, +static void dump_prot(struct ptdump_pg_state *st, const struct ptdump_prot_bits *bits, size_t num) { unsigned i; @@ -192,7 +159,7 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits, } } -static void note_prot_uxn(struct pg_state *st, unsigned long addr) +static void note_prot_uxn(struct ptdump_pg_state *st, unsigned long addr) { if (!st->check_wx) return; @@ -206,7 +173,7 @@ static void note_prot_uxn(struct pg_state *st, unsigned long addr) st->uxn_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_prot_wx(struct pg_state *st, unsigned long addr) +static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr) { if (!st->check_wx) return; @@ -221,16 +188,17 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - u64 val) +void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, + u64 val) { - struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump); + struct ptdump_pg_level *pg_level = st->pg_level; static const char units[] = "KMGTPE"; u64 prot = 0; /* check if the current level has been folded dynamically */ - if ((level == 1 && mm_p4d_folded(st->mm)) || - (level == 2 && mm_pud_folded(st->mm))) + if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) || + (level == 2 && mm_pud_folded(st->mm)))) level = 0; if (level >= 0) @@ -286,15 +254,16 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, void ptdump_walk(struct seq_file *s, struct ptdump_info *info) { unsigned long end = ~0UL; - struct pg_state st; + struct ptdump_pg_state st; if (info->base_addr < TASK_SIZE_64) end = TASK_SIZE_64; - st = (struct pg_state){ + st = (struct ptdump_pg_state){ .seq = s, .marker = info->markers, .mm = info->mm, + .pg_level = &kernel_pg_levels[0], .level = -1, .ptdump = { .note_page = note_page, @@ -312,10 +281,10 @@ static void __init ptdump_initialize(void) { unsigned i, j; - for (i = 0; i < ARRAY_SIZE(pg_level); i++) - if (pg_level[i].bits) - for (j = 0; j < pg_level[i].num; j++) - pg_level[i].mask |= pg_level[i].bits[j].mask; + for (i = 0; i < ARRAY_SIZE(kernel_pg_levels); i++) + if (kernel_pg_levels[i].bits) + for (j = 0; j < kernel_pg_levels[i].num; j++) + kernel_pg_levels[i].mask |= kernel_pg_levels[i].bits[j].mask; } static struct ptdump_info kernel_ptdump_info 
__ro_after_init = { @@ -324,12 +293,13 @@ static struct ptdump_info kernel_ptdump_info __ro_after_init = { bool ptdump_check_wx(void) { - struct pg_state st = { + struct ptdump_pg_state st = { .seq = NULL, .marker = (struct addr_marker[]) { { 0, NULL}, { -1, NULL}, }, + .pg_level = &kernel_pg_levels[0], .level = -1, .check_wx = true, .ptdump = { diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 5139a28130c0..0f7b484cb2ff 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -42,14 +42,16 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * the temporary mappings we use during restore. */ __set_pte(dst_ptep, pte_mkwrite_novma(pte)); - } else if ((debug_pagealloc_enabled() || - is_kfence_address((void *)addr)) && !pte_none(pte)) { + } else if (!pte_none(pte)) { /* * debug_pagealloc will removed the PTE_VALID bit if * the page isn't in use by the resume kernel. It may have * been in use by the original kernel, in which case we need * to put it back in our copy to do the restore. * + * Other cases include kfence / vmalloc / memfd_secret which + * may call `set_direct_map_invalid_noflush()`. + * * Before marking this entry valid, check the pfn should * be mapped. */ diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index ac3429d892b9..eedb5acc21ed 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -45,6 +45,7 @@ HAS_MOPS HAS_NESTED_VIRT HAS_PAN HAS_S1PIE +HAS_S1POE HAS_RAS_EXTN HAS_RNG HAS_SB diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 7ceaa1e0b4bc..8d637ac4b7c6 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2029,6 +2029,31 @@ Sysreg FAR_EL1 3 0 6 0 0 Field 63:0 ADDR EndSysreg +Sysreg PMICNTR_EL0 3 3 9 4 0 +Field 63:0 ICNT +EndSysreg + +Sysreg PMICFILTR_EL0 3 3 9 6 0 +Res0 63:59 +Field 58 SYNC +Field 57:56 VS +Res0 55:32 +Field 31 P +Field 30 U +Field 29 NSK +Field 28 NSU +Field 27 NSH +Field 26 M +Res0 25 +Field 24 SH +Field 23 T +Field 22 RLK +Field 21 RLU +Field 20 RLH +Res0 19:16 +Field 15:0 evtCount +EndSysreg + Sysreg PMSCR_EL1 3 0 9 9 0 Res0 63:8 Field 7:6 PCT @@ -2153,6 +2178,11 @@ Field 4 P Field 3:0 ALIGN EndSysreg +Sysreg PMSELR_EL0 3 3 9 12 5 +Res0 63:5 +Field 4:0 SEL +EndSysreg + SysregFields CONTEXTIDR_ELx Res0 63:32 Field 31:0 PROCID |
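
Usage note for the POE/pkeys plumbing above: the following is a minimal, hypothetical userspace sketch (not part of the series) showing the generic Memory Protection Keys API that these changes enable on arm64. It assumes a kernel built with CONFIG_ARM64_POE and a glibc providing the pkey_* wrappers; beyond that, only documented interfaces are used.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	int pkey = pkey_alloc(0, 0);	/* fails if POE is not supported */
	if (pkey < 0) {
		perror("pkey_alloc");
		return 1;
	}
	if (pkey_mprotect(buf, page, PROT_READ | PROT_WRITE, pkey))
		return 1;

	buf[0] = 'x';			/* permitted */

	/* Revoke access for this key in the current thread (POR_EL0). */
	pkey_set(pkey, PKEY_DISABLE_ACCESS);
	/* Touching buf here would now fault with si_code == SEGV_PKUERR. */

	pkey_set(pkey, 0);		/* restore access */
	pkey_free(pkey);
	return 0;
}

On the kernel side, such a fault is routed through fault_from_pkey() and reported via arm64_force_sig_fault_pkey(), as in the fault.c hunks above.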
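
The mem_encrypt.c dispatcher above only becomes active once a backend registers its callbacks. A hypothetical registration sketch follows; the callback names and bodies are invented, and the encrypt/decrypt member names are assumed from how crypt_ops is invoked in the patch.

#include <linux/init.h>
#include <asm/mem_encrypt.h>

static int my_guest_decrypt(unsigned long addr, int numpages)
{
	/* e.g. hypercall to share [addr, addr + numpages * PAGE_SIZE) with the host */
	return 0;
}

static int my_guest_encrypt(unsigned long addr, int numpages)
{
	/* e.g. hypercall to make the range private to the guest again */
	return 0;
}

static const struct arm64_mem_crypt_ops my_crypt_ops = {
	.encrypt	= my_guest_encrypt,
	.decrypt	= my_guest_decrypt,
};

static int __init my_coco_guest_init(void)
{
	/* Only one backend may register; a second caller gets -EBUSY. */
	return arm64_mem_crypt_ops_register(&my_crypt_ops);
}
early_initcall(my_coco_guest_init);

After this, set_memory_decrypted()/set_memory_encrypted() calls from drivers are forwarded to the registered ops instead of returning early.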
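
The ioremap_prot() hook added above is a similar single registration point. The snippet below is likewise hypothetical: the hook signature is inferred from how ioremap_prot() calls it (phys_addr, size, &pgprot), and pgprot_decrypted() is used purely as an illustrative transformation.

#include <linux/init.h>
#include <linux/io.h>
#include <linux/pgtable.h>

static int my_ioremap_hook(phys_addr_t phys_addr, size_t size, pgprot_t *prot)
{
	/* e.g. force MMIO mappings to be shared/decrypted in a protected guest */
	*prot = pgprot_decrypted(*prot);
	return 0;
}

static int __init my_platform_ioremap_init(void)
{
	/* WARNs and returns -EBUSY if a hook is already registered */
	return arm64_ioremap_prot_hook_register(my_ioremap_hook);
}
core_initcall(my_platform_ioremap_init);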