diff options
Diffstat (limited to 'arch')
121 files changed, 1384 insertions, 595 deletions
diff --git a/arch/arc/net/bpf_jit.h b/arch/arc/net/bpf_jit.h index ec44873c42d1..495f3023e4c1 100644 --- a/arch/arc/net/bpf_jit.h +++ b/arch/arc/net/bpf_jit.h @@ -39,7 +39,7 @@ /************** Functions that the back-end must provide **************/ /* Extension for 32-bit operations. */ -inline u8 zext(u8 *buf, u8 rd); +u8 zext(u8 *buf, u8 rd); /***** Moves *****/ u8 mov_r32(u8 *buf, u8 rd, u8 rs, u8 sign_ext); u8 mov_r32_i32(u8 *buf, u8 reg, s32 imm); diff --git a/arch/arc/net/bpf_jit_arcv2.c b/arch/arc/net/bpf_jit_arcv2.c index 31bfb6e9ce00..4458e409ca0a 100644 --- a/arch/arc/net/bpf_jit_arcv2.c +++ b/arch/arc/net/bpf_jit_arcv2.c @@ -62,7 +62,7 @@ enum { * If/when we decide to add ARCv2 instructions that do use register pairs, * the mapping, hopefully, doesn't need to be revisited. */ -const u8 bpf2arc[][2] = { +static const u8 bpf2arc[][2] = { /* Return value from in-kernel function, and exit value from eBPF */ [BPF_REG_0] = {ARC_R_8, ARC_R_9}, /* Arguments from eBPF program to in-kernel function */ @@ -1302,7 +1302,7 @@ static u8 arc_b(u8 *buf, s32 offset) /************* Packers (Deal with BPF_REGs) **************/ -inline u8 zext(u8 *buf, u8 rd) +u8 zext(u8 *buf, u8 rd) { if (rd != BPF_REG_FP) return arc_movi_r(buf, REG_HI(rd), 0); @@ -2235,6 +2235,7 @@ u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext) break; default: /* The caller must have handled this. */ + break; } } else { /* @@ -2253,6 +2254,7 @@ u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext) break; default: /* The caller must have handled this. */ + break; } } @@ -2517,7 +2519,7 @@ u8 arc_epilogue(u8 *buf, u32 usage, u16 frame_size) #define JCC64_NR_OF_JMPS 3 /* Number of jumps in jcc64 template. */ #define JCC64_INSNS_TO_END 3 /* Number of insn. inclusive the 2nd jmp to end. */ #define JCC64_SKIP_JMP 1 /* Index of the "skip" jump to "end". */ -const struct { +static const struct { /* * "jit_off" is common between all "jmp[]" and is coupled with * "cond" of each "jmp[]" instance. e.g.: @@ -2883,7 +2885,7 @@ u8 gen_jmp_64(u8 *buf, u8 rd, u8 rs, u8 cond, u32 curr_off, u32 targ_off) * The "ARC_CC_SET" becomes "CC_unequal" because of the "tst" * instruction that precedes the conditional branch. */ -const u8 arcv2_32_jmps[ARC_CC_LAST] = { +static const u8 arcv2_32_jmps[ARC_CC_LAST] = { [ARC_CC_UGT] = CC_great_u, [ARC_CC_UGE] = CC_great_eq_u, [ARC_CC_ULT] = CC_less_u, diff --git a/arch/arc/net/bpf_jit_core.c b/arch/arc/net/bpf_jit_core.c index 6f6b4ffccf2c..e3628922c24a 100644 --- a/arch/arc/net/bpf_jit_core.c +++ b/arch/arc/net/bpf_jit_core.c @@ -159,7 +159,7 @@ static void jit_dump(const struct jit_context *ctx) /* Initialise the context so there's no garbage. */ static int jit_ctx_init(struct jit_context *ctx, struct bpf_prog *prog) { - memset(ctx, 0, sizeof(ctx)); + memset(ctx, 0, sizeof(*ctx)); ctx->orig_prog = prog; @@ -167,7 +167,7 @@ static int jit_ctx_init(struct jit_context *ctx, struct bpf_prog *prog) ctx->prog = bpf_jit_blind_constants(prog); if (IS_ERR(ctx->prog)) return PTR_ERR(ctx->prog); - ctx->blinded = (ctx->prog == ctx->orig_prog ? false : true); + ctx->blinded = (ctx->prog != ctx->orig_prog); /* If the verifier doesn't zero-extend, then we have to do it. */ ctx->do_zext = !ctx->prog->aux->verifier_zext; @@ -1182,12 +1182,12 @@ static int jit_prepare(struct jit_context *ctx) } /* - * All the "handle_*()" functions have been called before by the - * "jit_prepare()". If there was an error, we would know by now. - * Therefore, no extra error checking at this point, other than - * a sanity check at the end that expects the calculated length - * (jit.len) to be equal to the length of generated instructions - * (jit.index). + * jit_compile() is the real compilation phase. jit_prepare() is + * invoked before jit_compile() as a dry-run to make sure everything + * will go OK and allocate the necessary memory. + * + * In the end, jit_compile() checks if it has produced the same number + * of instructions as jit_prepare() would. */ static int jit_compile(struct jit_context *ctx) { @@ -1407,9 +1407,9 @@ static struct bpf_prog *do_extra_pass(struct bpf_prog *prog) /* * This function may be invoked twice for the same stream of BPF - * instructions. The "extra pass" happens, when there are "call"s - * involved that their addresses are not known during the first - * invocation. + * instructions. The "extra pass" happens, when there are + * (re)locations involved that their addresses are not known + * during the first run. */ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { diff --git a/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi b/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi index d80440446473..05d7a462ea25 100644 --- a/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi @@ -85,7 +85,7 @@ }; }; - panel { + panel_dpi: panel { compatible = "sii,43wvf1g"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_display_power>; diff --git a/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso b/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso index c84e9b052527..151e9cee3c87 100644 --- a/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso +++ b/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso @@ -10,8 +10,6 @@ /plugin/; &{/} { - /delete-node/ panel; - hdmi: connector-hdmi { compatible = "hdmi-connector"; label = "hdmi"; @@ -82,6 +80,10 @@ }; }; +&panel_dpi { + status = "disabled"; +}; + &tve { status = "disabled"; }; diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 78282ced5038..e408399d5f0e 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -14,6 +14,7 @@ #include <asm/mach/map.h> #include <asm/mmu_context.h> #include <asm/ptrace.h> +#include <asm/uaccess.h> #ifdef CONFIG_EFI void efi_init(void); @@ -25,6 +26,18 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md, boo #define arch_efi_call_virt_setup() efi_virtmap_load() #define arch_efi_call_virt_teardown() efi_virtmap_unload() +#ifdef CONFIG_CPU_TTBR0_PAN +#undef arch_efi_call_virt +#define arch_efi_call_virt(p, f, args...) ({ \ + unsigned int flags = uaccess_save_and_enable(); \ + efi_status_t res = _Generic((p)->f(args), \ + efi_status_t: (p)->f(args), \ + default: ((p)->f(args), EFI_ABORTED)); \ + uaccess_restore(flags); \ + res; \ +}) +#endif + #define ARCH_EFI_IRQ_FLAGS_MASK \ (PSR_J_BIT | PSR_E_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | \ PSR_T_BIT | MODE_MASK) diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index a0b6d1e3812f..e61591f33a6c 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -232,11 +232,24 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long old; if (unlikely(atomic_read(¤t->tracing_graph_pause))) +err_out: return; if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER)) { - /* FP points one word below parent's top of stack */ - frame_pointer += 4; + /* + * Usually, the stack frames are contiguous in memory but cases + * have been observed where the next stack frame does not live + * at 'frame_pointer + 4' as this code used to assume. + * + * Instead, dereference the field in the stack frame that + * stores the SP of the calling frame: to avoid unbounded + * recursion, this cannot involve any ftrace instrumented + * functions, so use the __get_kernel_nofault() primitive + * directly. + */ + __get_kernel_nofault(&frame_pointer, + (unsigned long *)(frame_pointer - 8), + unsigned long, err_out); } else { struct stackframe frame = { .fp = frame_pointer, diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 4768b05fd765..98544741ce17 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -6,6 +6,7 @@ #include <dt-bindings/phy/phy-imx8-pcie.h> #include <dt-bindings/pwm/pwm.h> #include "imx8mm.dtsi" +#include "imx8mm-overdrive.dtsi" / { chosen { @@ -935,7 +936,7 @@ /* Verdin GPIO_9_DSI (pulled-up as active-low) */ pinctrl_gpio_9_dsi: gpio9dsigrp { fsl,pins = - <MX8MM_IOMUXC_NAND_RE_B_GPIO3_IO15 0x146>; /* SODIMM 17 */ + <MX8MM_IOMUXC_NAND_RE_B_GPIO3_IO15 0x1c6>; /* SODIMM 17 */ }; /* Verdin GPIO_10_DSI (pulled-up as active-low) */ diff --git a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi index 43f1d45ccc96..f5115f9e8c47 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi @@ -254,7 +254,7 @@ <&clk IMX8MP_CLK_CLKOUT2>, <&clk IMX8MP_AUDIO_PLL2_OUT>; assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL2_OUT>; - assigned-clock-rates = <13000000>, <13000000>, <156000000>; + assigned-clock-rates = <13000000>, <13000000>, <208000000>; reset-gpios = <&gpio4 1 GPIO_ACTIVE_HIGH>; status = "disabled"; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi index dec57fad6828..e2b5e7ac3e46 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi @@ -219,7 +219,7 @@ bluetooth { compatible = "brcm,bcm4330-bt"; - shutdown-gpios = <&gpio4 16 GPIO_ACTIVE_HIGH>; + shutdown-gpios = <&gpio1 3 GPIO_ACTIVE_HIGH>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts index 5c6b39c6933f..6e05361c1ffb 100644 --- a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts +++ b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts @@ -36,7 +36,7 @@ regulator-name = "SD1_SPWR"; regulator-min-microvolt = <3000000>; regulator-max-microvolt = <3000000>; - gpio = <&lsio_gpio4 19 GPIO_ACTIVE_HIGH>; + gpio = <&lsio_gpio4 7 GPIO_ACTIVE_HIGH>; enable-active-high; }; diff --git a/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts b/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts index d400d85f42a9..bd98eff4d685 100644 --- a/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts +++ b/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts @@ -296,7 +296,6 @@ vmmc-supply = <®_usdhc2_vmmc>; bus-width = <4>; status = "okay"; - no-sdio; no-mmc; }; diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index e4546b29dd0c..fd87c4b8f984 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -146,7 +146,7 @@ /* Coprocessor traps */ .macro __init_el2_cptr __check_hvhe .LnVHE_\@, x1 - mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN) + mov x0, #CPACR_ELx_FPEN msr cpacr_el1, x0 b .Lskip_set_cptr_\@ .LnVHE_\@: @@ -277,7 +277,7 @@ // (h)VHE case mrs x0, cpacr_el1 // Disable SVE traps - orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN) + orr x0, x0, #CPACR_ELx_ZEN msr cpacr_el1, x0 b .Lskip_set_cptr_\@ @@ -298,7 +298,7 @@ // (h)VHE case mrs x0, cpacr_el1 // Disable SME traps - orr x0, x0, #(CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN) + orr x0, x0, #CPACR_ELx_SMEN msr cpacr_el1, x0 b .Lskip_set_cptr_sme_\@ diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 4ff0ae3f6d66..41fd90895dfc 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -153,8 +153,9 @@ extern void __memset_io(volatile void __iomem *, int, size_t); * emit the large TLP from the CPU. */ -static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to, - const u32 *from, size_t count) +static __always_inline void +__const_memcpy_toio_aligned32(volatile u32 __iomem *to, const u32 *from, + size_t count) { switch (count) { case 8: @@ -196,24 +197,22 @@ static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to, void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count); -static inline void __const_iowrite32_copy(void __iomem *to, const void *from, - size_t count) +static __always_inline void +__iowrite32_copy(void __iomem *to, const void *from, size_t count) { - if (count == 8 || count == 4 || count == 2 || count == 1) { + if (__builtin_constant_p(count) && + (count == 8 || count == 4 || count == 2 || count == 1)) { __const_memcpy_toio_aligned32(to, from, count); dgh(); } else { __iowrite32_copy_full(to, from, count); } } +#define __iowrite32_copy __iowrite32_copy -#define __iowrite32_copy(to, from, count) \ - (__builtin_constant_p(count) ? \ - __const_iowrite32_copy(to, from, count) : \ - __iowrite32_copy_full(to, from, count)) - -static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to, - const u64 *from, size_t count) +static __always_inline void +__const_memcpy_toio_aligned64(volatile u64 __iomem *to, const u64 *from, + size_t count) { switch (count) { case 8: @@ -255,21 +254,18 @@ static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to, void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count); -static inline void __const_iowrite64_copy(void __iomem *to, const void *from, - size_t count) +static __always_inline void +__iowrite64_copy(void __iomem *to, const void *from, size_t count) { - if (count == 8 || count == 4 || count == 2 || count == 1) { + if (__builtin_constant_p(count) && + (count == 8 || count == 4 || count == 2 || count == 1)) { __const_memcpy_toio_aligned64(to, from, count); dgh(); } else { __iowrite64_copy_full(to, from, count); } } - -#define __iowrite64_copy(to, from, count) \ - (__builtin_constant_p(count) ? \ - __const_iowrite64_copy(to, from, count) : \ - __iowrite64_copy_full(to, from, count)) +#define __iowrite64_copy __iowrite64_copy /* * I/O memory mapping functions. diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index e01bb5ca13b7..b2adc2c6c82a 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -305,6 +305,12 @@ GENMASK(19, 14) | \ BIT(11)) +#define CPTR_VHE_EL2_RES0 (GENMASK(63, 32) | \ + GENMASK(27, 26) | \ + GENMASK(23, 22) | \ + GENMASK(19, 18) | \ + GENMASK(15, 0)) + /* Hyp Debug Configuration Register bits */ #define MDCR_EL2_E2TB_MASK (UL(0x3)) #define MDCR_EL2_E2TB_SHIFT (UL(24)) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 501e3e019c93..21650e7924d4 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -557,6 +557,68 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) vcpu_set_flag((v), e); \ } while (0) +#define __build_check_all_or_none(r, bits) \ + BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits)) + +#define __cpacr_to_cptr_clr(clr, set) \ + ({ \ + u64 cptr = 0; \ + \ + if ((set) & CPACR_ELx_FPEN) \ + cptr |= CPTR_EL2_TFP; \ + if ((set) & CPACR_ELx_ZEN) \ + cptr |= CPTR_EL2_TZ; \ + if ((set) & CPACR_ELx_SMEN) \ + cptr |= CPTR_EL2_TSM; \ + if ((clr) & CPACR_ELx_TTA) \ + cptr |= CPTR_EL2_TTA; \ + if ((clr) & CPTR_EL2_TAM) \ + cptr |= CPTR_EL2_TAM; \ + if ((clr) & CPTR_EL2_TCPAC) \ + cptr |= CPTR_EL2_TCPAC; \ + \ + cptr; \ + }) + +#define __cpacr_to_cptr_set(clr, set) \ + ({ \ + u64 cptr = 0; \ + \ + if ((clr) & CPACR_ELx_FPEN) \ + cptr |= CPTR_EL2_TFP; \ + if ((clr) & CPACR_ELx_ZEN) \ + cptr |= CPTR_EL2_TZ; \ + if ((clr) & CPACR_ELx_SMEN) \ + cptr |= CPTR_EL2_TSM; \ + if ((set) & CPACR_ELx_TTA) \ + cptr |= CPTR_EL2_TTA; \ + if ((set) & CPTR_EL2_TAM) \ + cptr |= CPTR_EL2_TAM; \ + if ((set) & CPTR_EL2_TCPAC) \ + cptr |= CPTR_EL2_TCPAC; \ + \ + cptr; \ + }) + +#define cpacr_clear_set(clr, set) \ + do { \ + BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \ + BUILD_BUG_ON((clr) & CPACR_ELx_E0POE); \ + __build_check_all_or_none((clr), CPACR_ELx_FPEN); \ + __build_check_all_or_none((set), CPACR_ELx_FPEN); \ + __build_check_all_or_none((clr), CPACR_ELx_ZEN); \ + __build_check_all_or_none((set), CPACR_ELx_ZEN); \ + __build_check_all_or_none((clr), CPACR_ELx_SMEN); \ + __build_check_all_or_none((set), CPACR_ELx_SMEN); \ + \ + if (has_vhe() || has_hvhe()) \ + sysreg_clear_set(cpacr_el1, clr, set); \ + else \ + sysreg_clear_set(cptr_el2, \ + __cpacr_to_cptr_clr(clr, set), \ + __cpacr_to_cptr_set(clr, set));\ + } while (0) + static __always_inline void kvm_write_cptr_el2(u64 val) { if (has_vhe() || has_hvhe()) @@ -570,17 +632,16 @@ static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) u64 val; if (has_vhe()) { - val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | - CPACR_EL1_ZEN_EL1EN); + val = (CPACR_ELx_FPEN | CPACR_EL1_ZEN_EL1EN); if (cpus_have_final_cap(ARM64_SME)) val |= CPACR_EL1_SMEN_EL1EN; } else if (has_hvhe()) { - val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); + val = CPACR_ELx_FPEN; if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) - val |= CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN; + val |= CPACR_ELx_ZEN; if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN; + val |= CPACR_ELx_SMEN; } else { val = CPTR_NVHE_EL2_RES1; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8170c04fde91..36b8e97bf49e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -76,6 +76,7 @@ static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; }; DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int __ro_after_init kvm_sve_max_vl; +extern unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void); u32 __attribute_const__ kvm_target_cpu(void); @@ -521,6 +522,20 @@ struct kvm_cpu_context { u64 *vncr_array; }; +struct cpu_sve_state { + __u64 zcr_el1; + + /* + * Ordering is important since __sve_save_state/__sve_restore_state + * relies on it. + */ + __u32 fpsr; + __u32 fpcr; + + /* Must be SVE_VQ_BYTES (128 bit) aligned. */ + __u8 sve_regs[]; +}; + /* * This structure is instantiated on a per-CPU basis, and contains * data that is: @@ -534,7 +549,15 @@ struct kvm_cpu_context { */ struct kvm_host_data { struct kvm_cpu_context host_ctxt; - struct user_fpsimd_state *fpsimd_state; /* hyp VA */ + + /* + * All pointers in this union are hyp VA. + * sve_state is only used in pKVM and if system_supports_sve(). + */ + union { + struct user_fpsimd_state *fpsimd_state; + struct cpu_sve_state *sve_state; + }; /* Ownership of the FP regs */ enum { diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 3e80464f8953..b05bceca3385 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -111,7 +111,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); -void __sve_restore_state(void *sve_pffr, u32 *fpsr); +void __sve_save_state(void *sve_pffr, u32 *fpsr, int save_ffr); +void __sve_restore_state(void *sve_pffr, u32 *fpsr, int restore_ffr); u64 __guest_enter(struct kvm_vcpu *vcpu); @@ -142,5 +143,6 @@ extern u64 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val); extern unsigned long kvm_nvhe_sym(__icache_flags); extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); +extern unsigned int kvm_nvhe_sym(kvm_host_sve_max_vl); #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index ad9cfb5c1ff4..cd56acd9a842 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -128,4 +128,13 @@ static inline unsigned long hyp_ffa_proxy_pages(void) return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE); } +static inline size_t pkvm_host_sve_state_size(void) +{ + if (!system_supports_sve()) + return 0; + + return size_add(sizeof(struct cpu_sve_state), + SVE_SIG_REGS_SIZE(sve_vq_from_vl(kvm_host_sve_max_vl))); +} + #endif /* __ARM64_KVM_PKVM_H__ */ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index dd6ce86d4332..b776e7424fe9 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -462,6 +462,9 @@ static int run_all_insn_set_hw_mode(unsigned int cpu) for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { struct insn_emulation *insn = insn_emulations[i]; bool enable = READ_ONCE(insn->current_mode) == INSN_HW; + if (insn->status == INSN_UNAVAILABLE) + continue; + if (insn->set_hw_mode && insn->set_hw_mode(enable)) { pr_warn("CPU[%u] cannot support the emulation of %s", cpu, insn->name); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 4a92096db34e..712718aed5dd 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -9,6 +9,7 @@ #include <linux/efi.h> #include <linux/init.h> +#include <linux/kmemleak.h> #include <linux/screen_info.h> #include <linux/vmalloc.h> @@ -213,6 +214,7 @@ l: if (!p) { return -ENOMEM; } + kmemleak_not_leak(p); efi_rt_stack_top = p + THREAD_SIZE; return 0; } diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9996a989b52e..59716789fe0f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1931,6 +1931,11 @@ static unsigned long nvhe_percpu_order(void) return size ? get_order(size) : 0; } +static size_t pkvm_host_sve_state_order(void) +{ + return get_order(pkvm_host_sve_state_size()); +} + /* A lookup table holding the hypervisor VA for each vector slot */ static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; @@ -2310,12 +2315,20 @@ static void __init teardown_subsystems(void) static void __init teardown_hyp_mode(void) { + bool free_sve = system_supports_sve() && is_protected_kvm_enabled(); int cpu; free_hyp_pgds(); for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); + + if (free_sve) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + free_pages((unsigned long) sve_state, pkvm_host_sve_state_order()); + } } } @@ -2398,6 +2411,58 @@ static int __init kvm_hyp_init_protection(u32 hyp_va_bits) return 0; } +static int init_pkvm_host_sve_state(void) +{ + int cpu; + + if (!system_supports_sve()) + return 0; + + /* Allocate pages for host sve state in protected mode. */ + for_each_possible_cpu(cpu) { + struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order()); + + if (!page) + return -ENOMEM; + + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page); + } + + /* + * Don't map the pages in hyp since these are only used in protected + * mode, which will (re)create its own mapping when initialized. + */ + + return 0; +} + +/* + * Finalizes the initialization of hyp mode, once everything else is initialized + * and the initialziation process cannot fail. + */ +static void finalize_init_hyp_mode(void) +{ + int cpu; + + if (system_supports_sve() && is_protected_kvm_enabled()) { + for_each_possible_cpu(cpu) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = + kern_hyp_va(sve_state); + } + } else { + for_each_possible_cpu(cpu) { + struct user_fpsimd_state *fpsimd_state; + + fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs; + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state = + kern_hyp_va(fpsimd_state); + } + } +} + static void pkvm_hyp_init_ptrauth(void) { struct kvm_cpu_context *hyp_ctxt; @@ -2566,6 +2631,10 @@ static int __init init_hyp_mode(void) goto out_err; } + err = init_pkvm_host_sve_state(); + if (err) + goto out_err; + err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); @@ -2730,6 +2799,13 @@ static __init int kvm_arm_init(void) if (err) goto out_subs; + /* + * This should be called after initialization is done and failure isn't + * possible anymore. + */ + if (!in_hyp_mode) + finalize_init_hyp_mode(); + kvm_arm_initialised = true; return 0; diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 72d733c74a38..54090967a335 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2181,16 +2181,23 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) if (forward_traps(vcpu, HCR_NV)) return; + spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); + spsr = kvm_check_illegal_exception_return(vcpu, spsr); + /* Check for an ERETAx */ esr = kvm_vcpu_get_esr(vcpu); if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) { /* - * Oh no, ERETAx failed to authenticate. If we have - * FPACCOMBINE, deliver an exception right away. If we - * don't, then let the mangled ELR value trickle down the + * Oh no, ERETAx failed to authenticate. + * + * If we have FPACCOMBINE and we don't have a pending + * Illegal Execution State exception (which has priority + * over FPAC), deliver an exception right away. + * + * Otherwise, let the mangled ELR value trickle down the * ERET handling, and the guest will have a little surprise. */ - if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE)) { + if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) { esr &= ESR_ELx_ERET_ISS_ERETA; esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC); kvm_inject_nested_sync(vcpu, esr); @@ -2201,17 +2208,11 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) preempt_disable(); kvm_arch_vcpu_put(vcpu); - spsr = __vcpu_sys_reg(vcpu, SPSR_EL2); - spsr = kvm_check_illegal_exception_return(vcpu, spsr); if (!esr_iss_is_eretax(esr)) elr = __vcpu_sys_reg(vcpu, ELR_EL2); trace_kvm_nested_eret(vcpu, elr, spsr); - /* - * Note that the current exception level is always the virtual EL2, - * since we set HCR_EL2.NV bit only when entering the virtual EL2. - */ *vcpu_pc(vcpu) = elr; *vcpu_cpsr(vcpu) = spsr; diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 1807d3a79a8a..521b32868d0d 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -90,6 +90,13 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) fpsimd_save_and_flush_cpu_state(); } } + + /* + * If normal guests gain SME support, maintain this behavior for pKVM + * guests, which don't support SME. + */ + WARN_ON(is_protected_kvm_enabled() && system_supports_sme() && + read_sysreg_s(SYS_SVCR)); } /* @@ -161,9 +168,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) if (has_vhe() && system_supports_sme()) { /* Also restore EL0 state seen on entry */ if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, - CPACR_EL1_SMEN_EL0EN | - CPACR_EL1_SMEN_EL1EN); + sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_SMEN); else sysreg_clear_set(CPACR_EL1, CPACR_EL1_SMEN_EL0EN, diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index e2f762d959bb..11098eb7eb44 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -251,6 +251,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case PSR_AA32_MODE_SVC: case PSR_AA32_MODE_ABT: case PSR_AA32_MODE_UND: + case PSR_AA32_MODE_SYS: if (!vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; @@ -276,7 +277,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { int i, nr_reg; - switch (*vcpu_cpsr(vcpu)) { + switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) { /* * Either we are dealing with user mode, and only the * first 15 registers (+ PC) must be narrowed to 32bit. diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c index 8d9670e6615d..449fa58cf3b6 100644 --- a/arch/arm64/kvm/hyp/aarch32.c +++ b/arch/arm64/kvm/hyp/aarch32.c @@ -50,9 +50,23 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) u32 cpsr_cond; int cond; - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_esr(vcpu) >> 30) + /* + * These are the exception classes that could fire with a + * conditional instruction. + */ + switch (kvm_vcpu_trap_get_class(vcpu)) { + case ESR_ELx_EC_CP15_32: + case ESR_ELx_EC_CP15_64: + case ESR_ELx_EC_CP14_MR: + case ESR_ELx_EC_CP14_LS: + case ESR_ELx_EC_FP_ASIMD: + case ESR_ELx_EC_CP10_ID: + case ESR_ELx_EC_CP14_64: + case ESR_ELx_EC_SVC32: + break; + default: return true; + } /* Is condition field valid? */ cond = kvm_vcpu_get_condition(vcpu); diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 61e6f3ba7b7d..e950875e31ce 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -25,3 +25,9 @@ SYM_FUNC_START(__sve_restore_state) sve_load 0, x1, x2, 3 ret SYM_FUNC_END(__sve_restore_state) + +SYM_FUNC_START(__sve_save_state) + mov x2, #1 + sve_save 0, x1, x2, 3 + ret +SYM_FUNC_END(__sve_save_state) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index a92566f36022..0c4de44534b7 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -316,10 +316,24 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) { sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); __sve_restore_state(vcpu_sve_pffr(vcpu), - &vcpu->arch.ctxt.fp_regs.fpsr); + &vcpu->arch.ctxt.fp_regs.fpsr, + true); write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); } +static inline void __hyp_sve_save_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR); + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); +} + +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); + /* * We trap the first access to the FP/SIMD to save the host context and * restore the guest context lazily. @@ -330,7 +344,6 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest; u8 esr_ec; - u64 reg; if (!system_supports_fpsimd()) return false; @@ -353,24 +366,15 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (has_vhe() || has_hvhe()) { - reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN; - if (sve_guest) - reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; - - sysreg_clear_set(cpacr_el1, 0, reg); - } else { - reg = CPTR_EL2_TFP; - if (sve_guest) - reg |= CPTR_EL2_TZ; - - sysreg_clear_set(cptr_el2, reg, 0); - } + if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) + cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + else + cpacr_clear_set(0, CPACR_ELx_FPEN); isb(); /* Write out the host state if it's in the registers */ if (host_owns_fp_regs()) - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); + kvm_hyp_save_fpsimd_host(vcpu); /* Restore the guest state */ if (sve_guest) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 22f374e9f532..24a9a8330d19 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -59,7 +59,6 @@ static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) } void pkvm_hyp_vm_table_init(void *tbl); -void pkvm_host_fpsimd_state_init(void); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva); diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 02746f9d0980..efb053af331c 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -177,6 +177,14 @@ static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) res); } +static void ffa_rx_release(struct arm_smccc_res *res) +{ + arm_smccc_1_1_smc(FFA_RX_RELEASE, + 0, 0, + 0, 0, 0, 0, 0, + res); +} + static void do_ffa_rxtx_map(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { @@ -543,16 +551,19 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, if (WARN_ON(offset > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { ret = FFA_RET_ABORTED; + ffa_rx_release(res); goto out_unlock; } if (len > ffa_desc_buf.len) { ret = FFA_RET_NO_MEMORY; + ffa_rx_release(res); goto out_unlock; } buf = ffa_desc_buf.buf; memcpy(buf, hyp_buffers.rx, fraglen); + ffa_rx_release(res); for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff); @@ -563,6 +574,7 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, fraglen = res->a3; memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + ffa_rx_release(res); } ffa_mem_reclaim(res, handle_lo, handle_hi, flags); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index d5c48dc98f67..f43d845f3c4e 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -23,20 +23,80 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) +{ + __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + /* + * On saving/restoring guest sve state, always use the maximum VL for + * the guest. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) guest VL. + */ + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); +} + +static void __hyp_sve_restore_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + /* + * On saving/restoring host sve state, always use the maximum VL for + * the host. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) host VL. + * + * Setting ZCR_EL2 to ZCR_ELx_LEN_MASK sets the effective length + * supported by the system (or limited at EL3). + */ + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); + write_sysreg_el1(sve_state->zcr_el1, SYS_ZCR); +} + +static void fpsimd_sve_flush(void) +{ + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + +static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + return; + + cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + isb(); + + if (vcpu_has_sve(vcpu)) + __hyp_sve_save_guest(vcpu); + else + __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + + if (system_supports_sve()) + __hyp_sve_restore_host(); + else + __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); + + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + fpsimd_sve_flush(); + hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; + /* Limit guest vector length to the maximum supported by the host. */ + hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl); hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; - hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; @@ -54,10 +114,11 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; unsigned int i; + fpsimd_sve_sync(&hyp_vcpu->vcpu); + host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; - host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2; host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault; @@ -79,6 +140,17 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) struct pkvm_hyp_vcpu *hyp_vcpu; struct kvm *host_kvm; + /* + * KVM (and pKVM) doesn't support SME guests for now, and + * ensures that SME features aren't enabled in pstate when + * loading a vcpu. Therefore, if SME features enabled the host + * is misbehaving. + */ + if (unlikely(system_supports_sme() && read_sysreg_s(SYS_SVCR))) { + ret = -EINVAL; + goto out; + } + host_kvm = kern_hyp_va(host_vcpu->kvm); hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, host_vcpu->vcpu_idx); @@ -405,11 +477,7 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) handle_host_smc(host_ctxt); break; case ESR_ELx_EC_SVE: - if (has_hvhe()) - sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN | - CPACR_EL1_ZEN_EL0EN)); - else - sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); + cpacr_clear_set(0, CPACR_ELx_ZEN); isb(); sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); break; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 16aa4875ddb8..95cf18574251 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -18,6 +18,8 @@ unsigned long __icache_flags; /* Used by kvm_get_vttbr(). */ unsigned int kvm_arm_vmid_bits; +unsigned int kvm_host_sve_max_vl; + /* * Set trap register values based on features in ID_AA64PFR0. */ @@ -63,7 +65,7 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) /* Trap SVE */ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { if (has_hvhe()) - cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; + cptr_clear |= CPACR_ELx_ZEN; else cptr_set |= CPTR_EL2_TZ; } @@ -247,17 +249,6 @@ void pkvm_hyp_vm_table_init(void *tbl) vm_table = tbl; } -void pkvm_host_fpsimd_state_init(void) -{ - unsigned long i; - - for (i = 0; i < hyp_nr_cpus; i++) { - struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i); - - host_data->fpsimd_state = &host_data->host_ctxt.fp_regs; - } -} - /* * Return the hyp vm structure corresponding to the handle. */ @@ -586,6 +577,8 @@ unlock: if (ret) unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + hyp_vcpu->vcpu.arch.cptr_el2 = kvm_get_reset_cptr_el2(&hyp_vcpu->vcpu); + return ret; } diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 859f22f754d3..f4350ba07b0b 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -67,6 +67,28 @@ static int divide_memory_pool(void *virt, unsigned long size) return 0; } +static int pkvm_create_host_sve_mappings(void) +{ + void *start, *end; + int ret, i; + + if (!system_supports_sve()) + return 0; + + for (i = 0; i < hyp_nr_cpus; i++) { + struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i); + struct cpu_sve_state *sve_state = host_data->sve_state; + + start = kern_hyp_va(sve_state); + end = start + PAGE_ALIGN(pkvm_host_sve_state_size()); + ret = pkvm_create_mappings(start, end, PAGE_HYP); + if (ret) + return ret; + } + + return 0; +} + static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits) @@ -125,6 +147,8 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } + pkvm_create_host_sve_mappings(); + /* * Map the host sections RO in the hypervisor, but transfer the * ownership from the host to the hypervisor itself to make sure they @@ -300,7 +324,6 @@ void __noreturn __pkvm_init_finalise(void) goto out; pkvm_hyp_vm_table_init(vm_table_base); - pkvm_host_fpsimd_state_init(); out: /* * We tail-called to here from handle___pkvm_init() and will not return, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6758cd905570..6af179c6356d 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -48,15 +48,14 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA; if (cpus_have_final_cap(ARM64_SME)) { if (has_hvhe()) - val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN); + val &= ~CPACR_ELx_SMEN; else val |= CPTR_EL2_TSM; } if (!guest_owns_fp_regs()) { if (has_hvhe()) - val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | - CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN); + val &= ~(CPACR_ELx_FPEN | CPACR_ELx_ZEN); else val |= CPTR_EL2_TFP | CPTR_EL2_TZ; @@ -182,6 +181,25 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_handle_pvm_sysreg(vcpu, exit_code)); } +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) { + __hyp_sve_save_host(); + + /* Re-enable SVE traps if not supported for the guest vcpu. */ + if (!vcpu_has_sve(vcpu)) + cpacr_clear_set(CPACR_ELx_ZEN, 0); + + } else { + __fpsimd_save_state(*host_data_ptr(fpsimd_state)); + } +} + static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index d7af5f46f22a..8fbb6a2e0559 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -93,8 +93,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val = read_sysreg(cpacr_el1); val |= CPACR_ELx_TTA; - val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | - CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); + val &= ~(CPACR_ELx_ZEN | CPACR_ELx_SMEN); /* * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to @@ -109,9 +108,9 @@ static void __activate_traps(struct kvm_vcpu *vcpu) if (guest_owns_fp_regs()) { if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; + val |= CPACR_ELx_ZEN; } else { - val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); + val &= ~CPACR_ELx_FPEN; __activate_traps_fpsimd32(vcpu); } @@ -262,6 +261,11 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) return true; } +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + __fpsimd_save_state(*host_data_ptr(fpsimd_state)); +} + static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 6813c7c7f00a..bae8536cbf00 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -58,8 +58,10 @@ static u64 limit_nv_id_reg(u32 id, u64 val) break; case SYS_ID_AA64PFR1_EL1: - /* Only support SSBS */ - val &= NV_FTR(PFR1, SSBS); + /* Only support BTI, SSBS, CSV2_frac */ + val &= (NV_FTR(PFR1, BT) | + NV_FTR(PFR1, SSBS) | + NV_FTR(PFR1, CSV2_frac)); break; case SYS_ID_AA64MMFR0_EL1: diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 1b7b58cb121f..3fc8ca164dbe 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -32,6 +32,7 @@ /* Maximum phys_shift supported for any VM on this host */ static u32 __ro_after_init kvm_ipa_limit; +unsigned int __ro_after_init kvm_host_sve_max_vl; /* * ARMv8 Reset Values @@ -51,6 +52,8 @@ int __init kvm_arm_init_sve(void) { if (system_supports_sve()) { kvm_sve_max_vl = sve_max_virtualisable_vl(); + kvm_host_sve_max_vl = sve_max_vl(); + kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl; /* * The get_sve_reg()/set_sve_reg() ioctl interface will need diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 8f5b7a3e7009..7f68cf58b978 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -391,7 +391,7 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); INIT_LIST_HEAD(&dist->rd_regions); } else { dist->vgic_cpu_base = VGIC_ADDR_UNDEF; diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index a3983a631b5a..9e50928f5d7d 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -919,8 +919,19 @@ free: return ret; } -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg) +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg) { + struct kvm_vcpu *vcpu; + unsigned long c; + + lockdep_assert_held(&kvm->arch.config_lock); + + /* Garbage collect the region */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (vcpu->arch.vgic_cpu.rdreg == rdreg) + vcpu->arch.vgic_cpu.rdreg = NULL; + } + list_del(&rdreg->list); kfree(rdreg); } @@ -945,7 +956,7 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) mutex_lock(&kvm->arch.config_lock); rdreg = vgic_v3_rdist_region_from_index(kvm, index); - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); mutex_unlock(&kvm->arch.config_lock); return ret; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 6106ebd5ba42..03d356a12377 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -316,7 +316,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index); -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg); +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg); bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 9f9486de0004..a3edced29ac1 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -376,7 +376,7 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, * clearing access/dirty for the whole block. */ unsigned long start = addr; - unsigned long end = start + nr; + unsigned long end = start + nr * PAGE_SIZE; if (pte_cont(__ptep_get(ptep + nr - 1))) end = ALIGN(end, CONT_PTE_SIZE); @@ -386,7 +386,7 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, ptep = contpte_align_down(ptep); } - __clear_young_dirty_ptes(vma, start, ptep, end - start, flags); + __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags); } EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes); diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index e38139c576ee..ddc042895d01 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -143,7 +143,7 @@ config LOONGARCH select HAVE_LIVEPATCH select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI - select HAVE_OBJTOOL if AS_HAS_EXPLICIT_RELOCS + select HAVE_OBJTOOL if AS_HAS_EXPLICIT_RELOCS && AS_HAS_THIN_ADD_SUB && !CC_IS_CLANG select HAVE_PCI select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -261,6 +261,9 @@ config AS_HAS_EXPLICIT_RELOCS config AS_HAS_FCSR_CLASS def_bool $(as-instr,movfcsr2gr \$t0$(comma)\$fcsr0) +config AS_HAS_THIN_ADD_SUB + def_bool $(cc-option,-Wa$(comma)-mthin-add-sub) + config AS_HAS_LSX_EXTENSION def_bool $(as-instr,vld \$vr0$(comma)\$a0$(comma)0) diff --git a/arch/loongarch/Kconfig.debug b/arch/loongarch/Kconfig.debug index 98d60630c3d4..8b2ce5b5d43e 100644 --- a/arch/loongarch/Kconfig.debug +++ b/arch/loongarch/Kconfig.debug @@ -28,6 +28,7 @@ config UNWINDER_PROLOGUE config UNWINDER_ORC bool "ORC unwinder" + depends on HAVE_OBJTOOL select OBJTOOL help This option enables the ORC (Oops Rewind Capability) unwinder for diff --git a/arch/loongarch/boot/dts/loongson-2k0500-ref.dts b/arch/loongarch/boot/dts/loongson-2k0500-ref.dts index 8aefb0c12672..a34734a6c3ce 100644 --- a/arch/loongarch/boot/dts/loongson-2k0500-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k0500-ref.dts @@ -44,14 +44,14 @@ &gmac0 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; bus_id = <0x0>; }; &gmac1 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; bus_id = <0x1>; }; diff --git a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts index 8463fe035386..23cf26cc3e5f 100644 --- a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts @@ -43,7 +43,7 @@ &gmac0 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; phy-handle = <&phy0>; mdio { compatible = "snps,dwmac-mdio"; @@ -58,7 +58,7 @@ &gmac1 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; phy-handle = <&phy1>; mdio { compatible = "snps,dwmac-mdio"; diff --git a/arch/loongarch/boot/dts/loongson-2k2000-ref.dts b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts index 74b99bd234cc..ea9e6985d0e9 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts @@ -92,7 +92,7 @@ &gmac2 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; phy-handle = <&phy2>; mdio { compatible = "snps,dwmac-mdio"; diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h index 21447fb1efc7..d78330916bd1 100644 --- a/arch/loongarch/include/asm/hw_breakpoint.h +++ b/arch/loongarch/include/asm/hw_breakpoint.h @@ -75,6 +75,8 @@ do { \ #define CSR_MWPC_NUM 0x3f #define CTRL_PLV_ENABLE 0x1e +#define CTRL_PLV0_ENABLE 0x02 +#define CTRL_PLV3_ENABLE 0x10 #define MWPnCFG3_LoadEn 8 #define MWPnCFG3_StoreEn 9 @@ -101,7 +103,7 @@ struct perf_event; struct perf_event_attr; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type, int *offset); + int *gen_len, int *gen_type); extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, diff --git a/arch/loongarch/include/asm/numa.h b/arch/loongarch/include/asm/numa.h index 27f319b49862..b5f9de9f102e 100644 --- a/arch/loongarch/include/asm/numa.h +++ b/arch/loongarch/include/asm/numa.h @@ -56,6 +56,7 @@ extern int early_cpu_to_node(int cpu); static inline void early_numa_add_cpu(int cpuid, s16 node) { } static inline void numa_add_cpu(unsigned int cpu) { } static inline void numa_remove_cpu(unsigned int cpu) { } +static inline void set_cpuid_to_node(int cpuid, s16 node) { } static inline int early_cpu_to_node(int cpu) { diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h index 45b507a7b06f..d9eafd3ee3d1 100644 --- a/arch/loongarch/include/asm/stackframe.h +++ b/arch/loongarch/include/asm/stackframe.h @@ -42,7 +42,7 @@ .macro JUMP_VIRT_ADDR temp1 temp2 li.d \temp1, CACHE_BASE pcaddi \temp2, 0 - or \temp1, \temp1, \temp2 + bstrins.d \temp1, \temp2, (DMW_PABITS - 1), 0 jirl zero, \temp1, 0xc .endm diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index c4f7de2e2805..4677ea8fa8e9 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -22,7 +22,7 @@ _head: .word MZ_MAGIC /* "MZ", MS-DOS header */ .org 0x8 - .dword kernel_entry /* Kernel entry point */ + .dword _kernel_entry /* Kernel entry point (physical address) */ .dword _kernel_asize /* Kernel image effective size */ .quad PHYS_LINK_KADDR /* Kernel image load offset from start of RAM */ .org 0x38 /* 0x20 ~ 0x37 reserved */ diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c index fc55c4de2a11..621ad7634df7 100644 --- a/arch/loongarch/kernel/hw_breakpoint.c +++ b/arch/loongarch/kernel/hw_breakpoint.c @@ -174,11 +174,21 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) static int hw_breakpoint_control(struct perf_event *bp, enum hw_breakpoint_ops ops) { - u32 ctrl; + u32 ctrl, privilege; int i, max_slots, enable; + struct pt_regs *regs; struct perf_event **slots; struct arch_hw_breakpoint *info = counter_arch_bp(bp); + if (arch_check_bp_in_kernelspace(info)) + privilege = CTRL_PLV0_ENABLE; + else + privilege = CTRL_PLV3_ENABLE; + + /* Whether bp belongs to a task. */ + if (bp->hw.target) + regs = task_pt_regs(bp->hw.target); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { /* Breakpoint */ slots = this_cpu_ptr(bp_on_reg); @@ -197,31 +207,38 @@ static int hw_breakpoint_control(struct perf_event *bp, switch (ops) { case HW_BREAKPOINT_INSTALL: /* Set the FWPnCFG/MWPnCFG 1~4 register. */ - write_wb_reg(CSR_CFG_ADDR, i, 0, info->address); - write_wb_reg(CSR_CFG_ADDR, i, 1, info->address); - write_wb_reg(CSR_CFG_MASK, i, 0, info->mask); - write_wb_reg(CSR_CFG_MASK, i, 1, info->mask); - write_wb_reg(CSR_CFG_ASID, i, 0, 0); - write_wb_reg(CSR_CFG_ASID, i, 1, 0); if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { - write_wb_reg(CSR_CFG_CTRL, i, 0, CTRL_PLV_ENABLE); + write_wb_reg(CSR_CFG_ADDR, i, 0, info->address); + write_wb_reg(CSR_CFG_MASK, i, 0, info->mask); + write_wb_reg(CSR_CFG_ASID, i, 0, 0); + write_wb_reg(CSR_CFG_CTRL, i, 0, privilege); } else { + write_wb_reg(CSR_CFG_ADDR, i, 1, info->address); + write_wb_reg(CSR_CFG_MASK, i, 1, info->mask); + write_wb_reg(CSR_CFG_ASID, i, 1, 0); ctrl = encode_ctrl_reg(info->ctrl); - write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | CTRL_PLV_ENABLE); + write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | privilege); } enable = csr_read64(LOONGARCH_CSR_CRMD); csr_write64(CSR_CRMD_WE | enable, LOONGARCH_CSR_CRMD); + if (bp->hw.target) + regs->csr_prmd |= CSR_PRMD_PWE; break; case HW_BREAKPOINT_UNINSTALL: /* Reset the FWPnCFG/MWPnCFG 1~4 register. */ - write_wb_reg(CSR_CFG_ADDR, i, 0, 0); - write_wb_reg(CSR_CFG_ADDR, i, 1, 0); - write_wb_reg(CSR_CFG_MASK, i, 0, 0); - write_wb_reg(CSR_CFG_MASK, i, 1, 0); - write_wb_reg(CSR_CFG_CTRL, i, 0, 0); - write_wb_reg(CSR_CFG_CTRL, i, 1, 0); - write_wb_reg(CSR_CFG_ASID, i, 0, 0); - write_wb_reg(CSR_CFG_ASID, i, 1, 0); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { + write_wb_reg(CSR_CFG_ADDR, i, 0, 0); + write_wb_reg(CSR_CFG_MASK, i, 0, 0); + write_wb_reg(CSR_CFG_CTRL, i, 0, 0); + write_wb_reg(CSR_CFG_ASID, i, 0, 0); + } else { + write_wb_reg(CSR_CFG_ADDR, i, 1, 0); + write_wb_reg(CSR_CFG_MASK, i, 1, 0); + write_wb_reg(CSR_CFG_CTRL, i, 1, 0); + write_wb_reg(CSR_CFG_ASID, i, 1, 0); + } + if (bp->hw.target) + regs->csr_prmd &= ~CSR_PRMD_PWE; break; } @@ -283,7 +300,7 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) * to generic breakpoint descriptions. */ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type, int *offset) + int *gen_len, int *gen_type) { /* Type */ switch (ctrl.type) { @@ -303,11 +320,6 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, return -EINVAL; } - if (!ctrl.len) - return -EINVAL; - - *offset = __ffs(ctrl.len); - /* Len */ switch (ctrl.len) { case LOONGARCH_BREAKPOINT_LEN_1: @@ -386,21 +398,17 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, struct arch_hw_breakpoint *hw) { int ret; - u64 alignment_mask, offset; + u64 alignment_mask; /* Build the arch_hw_breakpoint. */ ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; - if (hw->ctrl.type != LOONGARCH_BREAKPOINT_EXECUTE) - alignment_mask = 0x7; - else + if (hw->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { alignment_mask = 0x3; - offset = hw->address & alignment_mask; - - hw->address &= ~alignment_mask; - hw->ctrl.len <<= offset; + hw->address &= ~alignment_mask; + } return 0; } @@ -471,12 +479,15 @@ void breakpoint_handler(struct pt_regs *regs) slots = this_cpu_ptr(bp_on_reg); for (i = 0; i < boot_cpu_data.watch_ireg_count; ++i) { - bp = slots[i]; - if (bp == NULL) - continue; - perf_bp_event(bp, regs); + if ((csr_read32(LOONGARCH_CSR_FWPS) & (0x1 << i))) { + bp = slots[i]; + if (bp == NULL) + continue; + perf_bp_event(bp, regs); + csr_write32(0x1 << i, LOONGARCH_CSR_FWPS); + update_bp_registers(regs, 0, 0); + } } - update_bp_registers(regs, 0, 0); } NOKPROBE_SYMBOL(breakpoint_handler); @@ -488,12 +499,15 @@ void watchpoint_handler(struct pt_regs *regs) slots = this_cpu_ptr(wp_on_reg); for (i = 0; i < boot_cpu_data.watch_dreg_count; ++i) { - wp = slots[i]; - if (wp == NULL) - continue; - perf_bp_event(wp, regs); + if ((csr_read32(LOONGARCH_CSR_MWPS) & (0x1 << i))) { + wp = slots[i]; + if (wp == NULL) + continue; + perf_bp_event(wp, regs); + csr_write32(0x1 << i, LOONGARCH_CSR_MWPS); + update_bp_registers(regs, 0, 1); + } } - update_bp_registers(regs, 0, 1); } NOKPROBE_SYMBOL(watchpoint_handler); diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index c114c5ef1332..200109de1971 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -494,28 +494,14 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { - int err, len, type, offset; + int err, len, type; - err = arch_bp_generic_fields(ctrl, &len, &type, &offset); + err = arch_bp_generic_fields(ctrl, &len, &type); if (err) return err; - switch (note_type) { - case NT_LOONGARCH_HW_BREAK: - if ((type & HW_BREAKPOINT_X) != type) - return -EINVAL; - break; - case NT_LOONGARCH_HW_WATCH: - if ((type & HW_BREAKPOINT_RW) != type) - return -EINVAL; - break; - default: - return -EINVAL; - } - attr->bp_len = len; attr->bp_type = type; - attr->bp_addr += offset; return 0; } @@ -609,10 +595,27 @@ static int ptrace_hbp_set_ctrl(unsigned int note_type, return PTR_ERR(bp); attr = bp->attr; - decode_ctrl_reg(uctrl, &ctrl); - err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); - if (err) - return err; + + switch (note_type) { + case NT_LOONGARCH_HW_BREAK: + ctrl.type = LOONGARCH_BREAKPOINT_EXECUTE; + ctrl.len = LOONGARCH_BREAKPOINT_LEN_4; + break; + case NT_LOONGARCH_HW_WATCH: + decode_ctrl_reg(uctrl, &ctrl); + break; + default: + return -EINVAL; + } + + if (uctrl & CTRL_PLV_ENABLE) { + err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); + if (err) + return err; + attr.disabled = 0; + } else { + attr.disabled = 1; + } return modify_user_hw_breakpoint(bp, &attr); } @@ -643,6 +646,10 @@ static int ptrace_hbp_set_addr(unsigned int note_type, struct perf_event *bp; struct perf_event_attr attr; + /* Kernel-space address cannot be monitored by user-space */ + if ((unsigned long)addr >= XKPRANGE) + return -EINVAL; + bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx); if (IS_ERR(bp)) return PTR_ERR(bp); diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 60e0fe97f61a..3d048f1be143 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -282,7 +282,7 @@ static void __init fdt_setup(void) return; /* Prefer to use built-in dtb, checking its legality first. */ - if (!fdt_check_header(__dtb_start)) + if (IS_ENABLED(CONFIG_BUILTIN_DTB) && !fdt_check_header(__dtb_start)) fdt_pointer = __dtb_start; else fdt_pointer = efi_fdt_pointer(); /* Fallback to firmware dtb */ @@ -351,10 +351,8 @@ void __init platform_init(void) arch_reserve_vmcore(); arch_reserve_crashkernel(); -#ifdef CONFIG_ACPI_TABLE_UPGRADE - acpi_table_upgrade(); -#endif #ifdef CONFIG_ACPI + acpi_table_upgrade(); acpi_gbl_use_default_register_widths = false; acpi_boot_table_init(); #endif diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 0dfe2388ef41..1436d2465939 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -273,7 +273,6 @@ static void __init fdt_smp_setup(void) if (cpuid == loongson_sysconf.boot_cpu_id) { cpu = 0; - numa_add_cpu(cpu); } else { cpu = cpumask_next_zero(-1, cpu_present_mask); } @@ -283,6 +282,9 @@ static void __init fdt_smp_setup(void) set_cpu_present(cpu, true); __cpu_number_map[cpuid] = cpu; __cpu_logical_map[cpu] = cpuid; + + early_numa_add_cpu(cpu, 0); + set_cpuid_to_node(cpuid, 0); } loongson_sysconf.nr_cpus = num_processors; @@ -468,6 +470,7 @@ void smp_prepare_boot_cpu(void) set_cpu_possible(0, true); set_cpu_online(0, true); set_my_cpu_offset(per_cpu_offset(0)); + numa_add_cpu(0); rr_node = first_node(node_online_map); for_each_possible_cpu(cpu) { diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S index e8e97dbf9ca4..3c7595342730 100644 --- a/arch/loongarch/kernel/vmlinux.lds.S +++ b/arch/loongarch/kernel/vmlinux.lds.S @@ -6,6 +6,7 @@ #define PAGE_SIZE _PAGE_SIZE #define RO_EXCEPTION_TABLE_ALIGN 4 +#define PHYSADDR_MASK 0xffffffffffff /* 48-bit */ /* * Put .bss..swapper_pg_dir as the first thing in .bss. This will @@ -142,10 +143,11 @@ SECTIONS #ifdef CONFIG_EFI_STUB /* header symbols */ - _kernel_asize = _end - _text; - _kernel_fsize = _edata - _text; - _kernel_vsize = _end - __initdata_begin; - _kernel_rsize = _edata - __initdata_begin; + _kernel_entry = ABSOLUTE(kernel_entry & PHYSADDR_MASK); + _kernel_asize = ABSOLUTE(_end - _text); + _kernel_fsize = ABSOLUTE(_edata - _text); + _kernel_vsize = ABSOLUTE(_end - __initdata_begin); + _kernel_rsize = ABSOLUTE(_edata - __initdata_begin); #endif .gptab.sdata : { diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index c86e099af5ca..a68573e091c0 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -761,7 +761,7 @@ static void kvm_handle_service(struct kvm_vcpu *vcpu) default: ret = KVM_HCALL_INVALID_CODE; break; - }; + } kvm_write_reg(vcpu, LOONGARCH_GPR_A0, ret); } diff --git a/arch/mips/bmips/setup.c b/arch/mips/bmips/setup.c index ec180ab92eaa..66a8ba19c287 100644 --- a/arch/mips/bmips/setup.c +++ b/arch/mips/bmips/setup.c @@ -110,7 +110,8 @@ static void bcm6358_quirks(void) * RAC flush causes kernel panics on BCM6358 when booting from TP1 * because the bootloader is not initializing it properly. */ - bmips_rac_flush_disable = !!(read_c0_brcm_cmt_local() & (1 << 31)); + bmips_rac_flush_disable = !!(read_c0_brcm_cmt_local() & (1 << 31)) || + !!BMIPS_GET_CBR(); } static void bcm6368_quirks(void) diff --git a/arch/mips/include/asm/mipsmtregs.h b/arch/mips/include/asm/mipsmtregs.h index 30e86861c206..b1ee3c48e84b 100644 --- a/arch/mips/include/asm/mipsmtregs.h +++ b/arch/mips/include/asm/mipsmtregs.h @@ -322,7 +322,7 @@ static inline void ehb(void) " .set push \n" \ " .set "MIPS_ISA_LEVEL" \n" \ _ASM_SET_MFTC0 \ - " mftc0 $1, " #rt ", " #sel " \n" \ + " mftc0 %0, " #rt ", " #sel " \n" \ _ASM_UNSET_MFTC0 \ " .set pop \n" \ : "=r" (__res)); \ diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 008ebe60263e..81428a2eb660 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -27,7 +27,7 @@ 17 o32 break sys_ni_syscall # 18 was sys_stat 18 o32 unused18 sys_ni_syscall -19 o32 lseek sys_lseek +19 o32 lseek sys_lseek compat_sys_lseek 20 o32 getpid sys_getpid 21 o32 mount sys_mount 22 o32 umount sys_oldumount diff --git a/arch/mips/pci/ops-rc32434.c b/arch/mips/pci/ops-rc32434.c index 874ed6df9768..34b9323bdabb 100644 --- a/arch/mips/pci/ops-rc32434.c +++ b/arch/mips/pci/ops-rc32434.c @@ -112,8 +112,8 @@ retry: * gives them time to settle */ if (where == PCI_VENDOR_ID) { - if (ret == 0xffffffff || ret == 0x00000000 || - ret == 0x0000ffff || ret == 0xffff0000) { + if (*val == 0xffffffff || *val == 0x00000000 || + *val == 0x0000ffff || *val == 0xffff0000) { if (delay > 4) return 0; delay *= 2; diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index ba4c05bc24d6..8394718870e1 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -31,18 +31,17 @@ void flush_cache_all_local(void); void flush_cache_all(void); void flush_cache_mm(struct mm_struct *mm); -void flush_kernel_dcache_page_addr(const void *addr); - #define flush_kernel_dcache_range(start,size) \ flush_kernel_dcache_range_asm((start), (start)+(size)); +/* The only way to flush a vmap range is to flush whole cache */ #define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *vaddr, int size); void invalidate_kernel_vmap_range(void *vaddr, int size); -#define flush_cache_vmap(start, end) flush_cache_all() +void flush_cache_vmap(unsigned long start, unsigned long end); #define flush_cache_vmap_early(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) flush_cache_all() +void flush_cache_vunmap(unsigned long start, unsigned long end); void flush_dcache_folio(struct folio *folio); #define flush_dcache_folio flush_dcache_folio @@ -77,17 +76,11 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); -/* defined in pacache.S exported in cache.c used by flush_anon_page */ -void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); - #define ARCH_HAS_FLUSH_ANON_PAGE void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr); #define ARCH_HAS_FLUSH_ON_KUNMAP -static inline void kunmap_flush_on_unmap(const void *addr) -{ - flush_kernel_dcache_page_addr(addr); -} +void kunmap_flush_on_unmap(const void *addr); #endif /* _PARISC_CACHEFLUSH_H */ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 974accac05cd..babf65751e81 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -448,14 +448,17 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } +static inline pte_t ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} +#define ptep_get ptep_get + static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; - if (!pte_young(*ptep)) - return 0; - - pte = *ptep; + pte = ptep_get(ptep); if (!pte_young(pte)) { return 0; } @@ -463,17 +466,10 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned return 1; } -struct mm_struct; -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - pte_t old_pte; - - old_pte = *ptep; - set_pte(ptep, __pte(0)); - - return old_pte; -} +int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte(ptep, pte_wrprotect(*ptep)); @@ -511,7 +507,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define __HAVE_ARCH_PTE_SAME diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 422f3e1e6d9c..483bfafd930c 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -20,6 +20,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/syscalls.h> +#include <linux/vmalloc.h> #include <asm/pdc.h> #include <asm/cache.h> #include <asm/cacheflush.h> @@ -31,20 +32,31 @@ #include <asm/mmu_context.h> #include <asm/cachectl.h> +#define PTR_PAGE_ALIGN_DOWN(addr) PTR_ALIGN_DOWN(addr, PAGE_SIZE) + +/* + * When nonzero, use _PAGE_ACCESSED bit to try to reduce the number + * of page flushes done flush_cache_page_if_present. There are some + * pros and cons in using this option. It may increase the risk of + * random segmentation faults. + */ +#define CONFIG_FLUSH_PAGE_ACCESSED 0 + int split_tlb __ro_after_init; int dcache_stride __ro_after_init; int icache_stride __ro_after_init; EXPORT_SYMBOL(dcache_stride); +/* Internal implementation in arch/parisc/kernel/pacache.S */ void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); EXPORT_SYMBOL(flush_dcache_page_asm); void purge_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); void flush_icache_page_asm(unsigned long phys_addr, unsigned long vaddr); - -/* Internal implementation in arch/parisc/kernel/pacache.S */ void flush_data_cache_local(void *); /* flushes local data-cache only */ void flush_instruction_cache_local(void); /* flushes local code-cache only */ +static void flush_kernel_dcache_page_addr(const void *addr); + /* On some machines (i.e., ones with the Merced bus), there can be * only a single PxTLB broadcast at a time; this must be guaranteed * by software. We need a spinlock around all TLB flushes to ensure @@ -321,6 +333,18 @@ __flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, { if (!static_branch_likely(&parisc_has_cache)) return; + + /* + * The TLB is the engine of coherence on parisc. The CPU is + * entitled to speculate any page with a TLB mapping, so here + * we kill the mapping then flush the page along a special flush + * only alias mapping. This guarantees that the page is no-longer + * in the cache for any process and nor may it be speculatively + * read in (until the user or kernel specifically accesses it, + * of course). + */ + flush_tlb_page(vma, vmaddr); + preempt_disable(); flush_dcache_page_asm(physaddr, vmaddr); if (vma->vm_flags & VM_EXEC) @@ -328,46 +352,44 @@ __flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, preempt_enable(); } -static void flush_user_cache_page(struct vm_area_struct *vma, unsigned long vmaddr) +static void flush_kernel_dcache_page_addr(const void *addr) { - unsigned long flags, space, pgd, prot; -#ifdef CONFIG_TLB_PTLOCK - unsigned long pgd_lock; -#endif + unsigned long vaddr = (unsigned long)addr; + unsigned long flags; - vmaddr &= PAGE_MASK; + /* Purge TLB entry to remove translation on all CPUs */ + purge_tlb_start(flags); + pdtlb(SR_KERNEL, addr); + purge_tlb_end(flags); + /* Use tmpalias flush to prevent data cache move-in */ preempt_disable(); + flush_dcache_page_asm(__pa(vaddr), vaddr); + preempt_enable(); +} - /* Set context for flush */ - local_irq_save(flags); - prot = mfctl(8); - space = mfsp(SR_USER); - pgd = mfctl(25); -#ifdef CONFIG_TLB_PTLOCK - pgd_lock = mfctl(28); -#endif - switch_mm_irqs_off(NULL, vma->vm_mm, NULL); - local_irq_restore(flags); - - flush_user_dcache_range_asm(vmaddr, vmaddr + PAGE_SIZE); - if (vma->vm_flags & VM_EXEC) - flush_user_icache_range_asm(vmaddr, vmaddr + PAGE_SIZE); - flush_tlb_page(vma, vmaddr); +static void flush_kernel_icache_page_addr(const void *addr) +{ + unsigned long vaddr = (unsigned long)addr; + unsigned long flags; - /* Restore previous context */ - local_irq_save(flags); -#ifdef CONFIG_TLB_PTLOCK - mtctl(pgd_lock, 28); -#endif - mtctl(pgd, 25); - mtsp(space, SR_USER); - mtctl(prot, 8); - local_irq_restore(flags); + /* Purge TLB entry to remove translation on all CPUs */ + purge_tlb_start(flags); + pdtlb(SR_KERNEL, addr); + purge_tlb_end(flags); + /* Use tmpalias flush to prevent instruction cache move-in */ + preempt_disable(); + flush_icache_page_asm(__pa(vaddr), vaddr); preempt_enable(); } +void kunmap_flush_on_unmap(const void *addr) +{ + flush_kernel_dcache_page_addr(addr); +} +EXPORT_SYMBOL(kunmap_flush_on_unmap); + void flush_icache_pages(struct vm_area_struct *vma, struct page *page, unsigned int nr) { @@ -375,13 +397,16 @@ void flush_icache_pages(struct vm_area_struct *vma, struct page *page, for (;;) { flush_kernel_dcache_page_addr(kaddr); - flush_kernel_icache_page(kaddr); + flush_kernel_icache_page_addr(kaddr); if (--nr == 0) break; kaddr += PAGE_SIZE; } } +/* + * Walk page directory for MM to find PTEP pointer for address ADDR. + */ static inline pte_t *get_ptep(struct mm_struct *mm, unsigned long addr) { pte_t *ptep = NULL; @@ -410,6 +435,41 @@ static inline bool pte_needs_flush(pte_t pte) == (_PAGE_PRESENT | _PAGE_ACCESSED); } +/* + * Return user physical address. Returns 0 if page is not present. + */ +static inline unsigned long get_upa(struct mm_struct *mm, unsigned long addr) +{ + unsigned long flags, space, pgd, prot, pa; +#ifdef CONFIG_TLB_PTLOCK + unsigned long pgd_lock; +#endif + + /* Save context */ + local_irq_save(flags); + prot = mfctl(8); + space = mfsp(SR_USER); + pgd = mfctl(25); +#ifdef CONFIG_TLB_PTLOCK + pgd_lock = mfctl(28); +#endif + + /* Set context for lpa_user */ + switch_mm_irqs_off(NULL, mm, NULL); + pa = lpa_user(addr); + + /* Restore previous context */ +#ifdef CONFIG_TLB_PTLOCK + mtctl(pgd_lock, 28); +#endif + mtctl(pgd, 25); + mtsp(space, SR_USER); + mtctl(prot, 8); + local_irq_restore(flags); + + return pa; +} + void flush_dcache_folio(struct folio *folio) { struct address_space *mapping = folio_flush_mapping(folio); @@ -458,50 +518,23 @@ void flush_dcache_folio(struct folio *folio) if (addr + nr * PAGE_SIZE > vma->vm_end) nr = (vma->vm_end - addr) / PAGE_SIZE; - if (parisc_requires_coherency()) { - for (i = 0; i < nr; i++) { - pte_t *ptep = get_ptep(vma->vm_mm, - addr + i * PAGE_SIZE); - if (!ptep) - continue; - if (pte_needs_flush(*ptep)) - flush_user_cache_page(vma, - addr + i * PAGE_SIZE); - /* Optimise accesses to the same table? */ - pte_unmap(ptep); - } - } else { + if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) + != (addr & (SHM_COLOUR - 1))) { + for (i = 0; i < nr; i++) + __flush_cache_page(vma, + addr + i * PAGE_SIZE, + (pfn + i) * PAGE_SIZE); /* - * The TLB is the engine of coherence on parisc: - * The CPU is entitled to speculate any page - * with a TLB mapping, so here we kill the - * mapping then flush the page along a special - * flush only alias mapping. This guarantees that - * the page is no-longer in the cache for any - * process and nor may it be speculatively read - * in (until the user or kernel specifically - * accesses it, of course) + * Software is allowed to have any number + * of private mappings to a page. */ - for (i = 0; i < nr; i++) - flush_tlb_page(vma, addr + i * PAGE_SIZE); - if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) - != (addr & (SHM_COLOUR - 1))) { - for (i = 0; i < nr; i++) - __flush_cache_page(vma, - addr + i * PAGE_SIZE, - (pfn + i) * PAGE_SIZE); - /* - * Software is allowed to have any number - * of private mappings to a page. - */ - if (!(vma->vm_flags & VM_SHARED)) - continue; - if (old_addr) - pr_err("INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", - old_addr, addr, vma->vm_file); - if (nr == folio_nr_pages(folio)) - old_addr = addr; - } + if (!(vma->vm_flags & VM_SHARED)) + continue; + if (old_addr) + pr_err("INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", + old_addr, addr, vma->vm_file); + if (nr == folio_nr_pages(folio)) + old_addr = addr; } WARN_ON(++count == 4096); } @@ -591,35 +624,28 @@ extern void purge_kernel_dcache_page_asm(unsigned long); extern void clear_user_page_asm(void *, unsigned long); extern void copy_user_page_asm(void *, void *, unsigned long); -void flush_kernel_dcache_page_addr(const void *addr) -{ - unsigned long flags; - - flush_kernel_dcache_page_asm(addr); - purge_tlb_start(flags); - pdtlb(SR_KERNEL, addr); - purge_tlb_end(flags); -} -EXPORT_SYMBOL(flush_kernel_dcache_page_addr); - static void flush_cache_page_if_present(struct vm_area_struct *vma, - unsigned long vmaddr, unsigned long pfn) + unsigned long vmaddr) { +#if CONFIG_FLUSH_PAGE_ACCESSED bool needs_flush = false; - pte_t *ptep; + pte_t *ptep, pte; - /* - * The pte check is racy and sometimes the flush will trigger - * a non-access TLB miss. Hopefully, the page has already been - * flushed. - */ ptep = get_ptep(vma->vm_mm, vmaddr); if (ptep) { - needs_flush = pte_needs_flush(*ptep); + pte = ptep_get(ptep); + needs_flush = pte_needs_flush(pte); pte_unmap(ptep); } if (needs_flush) - flush_cache_page(vma, vmaddr, pfn); + __flush_cache_page(vma, vmaddr, PFN_PHYS(pte_pfn(pte))); +#else + struct mm_struct *mm = vma->vm_mm; + unsigned long physaddr = get_upa(mm, vmaddr); + + if (physaddr) + __flush_cache_page(vma, vmaddr, PAGE_ALIGN_DOWN(physaddr)); +#endif } void copy_user_highpage(struct page *to, struct page *from, @@ -629,7 +655,7 @@ void copy_user_highpage(struct page *to, struct page *from, kfrom = kmap_local_page(from); kto = kmap_local_page(to); - flush_cache_page_if_present(vma, vaddr, page_to_pfn(from)); + __flush_cache_page(vma, vaddr, PFN_PHYS(page_to_pfn(from))); copy_page_asm(kto, kfrom); kunmap_local(kto); kunmap_local(kfrom); @@ -638,16 +664,17 @@ void copy_user_highpage(struct page *to, struct page *from, void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long user_vaddr, void *dst, void *src, int len) { - flush_cache_page_if_present(vma, user_vaddr, page_to_pfn(page)); + __flush_cache_page(vma, user_vaddr, PFN_PHYS(page_to_pfn(page))); memcpy(dst, src, len); - flush_kernel_dcache_range_asm((unsigned long)dst, (unsigned long)dst + len); + flush_kernel_dcache_page_addr(PTR_PAGE_ALIGN_DOWN(dst)); } void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long user_vaddr, void *dst, void *src, int len) { - flush_cache_page_if_present(vma, user_vaddr, page_to_pfn(page)); + __flush_cache_page(vma, user_vaddr, PFN_PHYS(page_to_pfn(page))); memcpy(dst, src, len); + flush_kernel_dcache_page_addr(PTR_PAGE_ALIGN_DOWN(src)); } /* __flush_tlb_range() @@ -681,32 +708,10 @@ int __flush_tlb_range(unsigned long sid, unsigned long start, static void flush_cache_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - unsigned long addr, pfn; - pte_t *ptep; - - for (addr = start; addr < end; addr += PAGE_SIZE) { - bool needs_flush = false; - /* - * The vma can contain pages that aren't present. Although - * the pte search is expensive, we need the pte to find the - * page pfn and to check whether the page should be flushed. - */ - ptep = get_ptep(vma->vm_mm, addr); - if (ptep) { - needs_flush = pte_needs_flush(*ptep); - pfn = pte_pfn(*ptep); - pte_unmap(ptep); - } - if (needs_flush) { - if (parisc_requires_coherency()) { - flush_user_cache_page(vma, addr); - } else { - if (WARN_ON(!pfn_valid(pfn))) - return; - __flush_cache_page(vma, addr, PFN_PHYS(pfn)); - } - } - } + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE) + flush_cache_page_if_present(vma, addr); } static inline unsigned long mm_total_size(struct mm_struct *mm) @@ -757,21 +762,19 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) return; flush_tlb_range(vma, start, end); - flush_cache_all(); + if (vma->vm_flags & VM_EXEC) + flush_cache_all(); + else + flush_data_cache(); return; } - flush_cache_pages(vma, start, end); + flush_cache_pages(vma, start & PAGE_MASK, end); } void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) { - if (WARN_ON(!pfn_valid(pfn))) - return; - if (parisc_requires_coherency()) - flush_user_cache_page(vma, vmaddr); - else - __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); + __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); } void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) @@ -779,34 +782,133 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned lon if (!PageAnon(page)) return; - if (parisc_requires_coherency()) { - if (vma->vm_flags & VM_SHARED) - flush_data_cache(); - else - flush_user_cache_page(vma, vmaddr); + __flush_cache_page(vma, vmaddr, PFN_PHYS(page_to_pfn(page))); +} + +int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + pte_t pte = ptep_get(ptep); + + if (!pte_young(pte)) + return 0; + set_pte(ptep, pte_mkold(pte)); +#if CONFIG_FLUSH_PAGE_ACCESSED + __flush_cache_page(vma, addr, PFN_PHYS(pte_pfn(pte))); +#endif + return 1; +} + +/* + * After a PTE is cleared, we have no way to flush the cache for + * the physical page. On PA8800 and PA8900 processors, these lines + * can cause random cache corruption. Thus, we must flush the cache + * as well as the TLB when clearing a PTE that's valid. + */ +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + struct mm_struct *mm = (vma)->vm_mm; + pte_t pte = ptep_get_and_clear(mm, addr, ptep); + unsigned long pfn = pte_pfn(pte); + + if (pfn_valid(pfn)) + __flush_cache_page(vma, addr, PFN_PHYS(pfn)); + else if (pte_accessible(mm, pte)) + flush_tlb_page(vma, addr); + + return pte; +} + +/* + * The physical address for pages in the ioremap case can be obtained + * from the vm_struct struct. I wasn't able to successfully handle the + * vmalloc and vmap cases. We have an array of struct page pointers in + * the uninitialized vmalloc case but the flush failed using page_to_pfn. + */ +void flush_cache_vmap(unsigned long start, unsigned long end) +{ + unsigned long addr, physaddr; + struct vm_struct *vm; + + /* Prevent cache move-in */ + flush_tlb_kernel_range(start, end); + + if (end - start >= parisc_cache_flush_threshold) { + flush_cache_all(); return; } - flush_tlb_page(vma, vmaddr); - preempt_disable(); - flush_dcache_page_asm(page_to_phys(page), vmaddr); - preempt_enable(); + if (WARN_ON_ONCE(!is_vmalloc_addr((void *)start))) { + flush_cache_all(); + return; + } + + vm = find_vm_area((void *)start); + if (WARN_ON_ONCE(!vm)) { + flush_cache_all(); + return; + } + + /* The physical addresses of IOREMAP regions are contiguous */ + if (vm->flags & VM_IOREMAP) { + physaddr = vm->phys_addr; + for (addr = start; addr < end; addr += PAGE_SIZE) { + preempt_disable(); + flush_dcache_page_asm(physaddr, start); + flush_icache_page_asm(physaddr, start); + preempt_enable(); + physaddr += PAGE_SIZE; + } + return; + } + + flush_cache_all(); } +EXPORT_SYMBOL(flush_cache_vmap); +/* + * The vm_struct has been retired and the page table is set up. The + * last page in the range is a guard page. Its physical address can't + * be determined using lpa, so there is no way to flush the range + * using flush_dcache_page_asm. + */ +void flush_cache_vunmap(unsigned long start, unsigned long end) +{ + /* Prevent cache move-in */ + flush_tlb_kernel_range(start, end); + flush_data_cache(); +} +EXPORT_SYMBOL(flush_cache_vunmap); + +/* + * On systems with PA8800/PA8900 processors, there is no way to flush + * a vmap range other than using the architected loop to flush the + * entire cache. The page directory is not set up, so we can't use + * fdc, etc. FDCE/FICE don't work to flush a portion of the cache. + * L2 is physically indexed but FDCE/FICE instructions in virtual + * mode output their virtual address on the core bus, not their + * real address. As a result, the L2 cache index formed from the + * virtual address will most likely not be the same as the L2 index + * formed from the real address. + */ void flush_kernel_vmap_range(void *vaddr, int size) { unsigned long start = (unsigned long)vaddr; unsigned long end = start + size; - if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && - (unsigned long)size >= parisc_cache_flush_threshold) { - flush_tlb_kernel_range(start, end); - flush_data_cache(); + flush_tlb_kernel_range(start, end); + + if (!static_branch_likely(&parisc_has_dcache)) + return; + + /* If interrupts are disabled, we can only do local flush */ + if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) { + flush_data_cache_local(NULL); return; } - flush_kernel_dcache_range_asm(start, end); - flush_tlb_kernel_range(start, end); + flush_data_cache(); } EXPORT_SYMBOL(flush_kernel_vmap_range); @@ -818,15 +920,18 @@ void invalidate_kernel_vmap_range(void *vaddr, int size) /* Ensure DMA is complete */ asm_syncdma(); - if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && - (unsigned long)size >= parisc_cache_flush_threshold) { - flush_tlb_kernel_range(start, end); - flush_data_cache(); + flush_tlb_kernel_range(start, end); + + if (!static_branch_likely(&parisc_has_dcache)) + return; + + /* If interrupts are disabled, we can only do local flush */ + if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) { + flush_data_cache_local(NULL); return; } - purge_kernel_dcache_range_asm(start, end); - flush_tlb_kernel_range(start, end); + flush_data_cache(); } EXPORT_SYMBOL(invalidate_kernel_vmap_range); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3c968f2f4ac4..c88c6d46a5bc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,7 +137,7 @@ config PPC select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_HUGEPD if HUGETLB_PAGE select ARCH_HAS_KCOV - select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC_FPU + select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEMREMAP_COMPAT_ALIGN if PPC_64S_HASH_MMU diff --git a/arch/powerpc/crypto/.gitignore b/arch/powerpc/crypto/.gitignore index e1094f08f713..e9fe73aac8b6 100644 --- a/arch/powerpc/crypto/.gitignore +++ b/arch/powerpc/crypto/.gitignore @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only aesp10-ppc.S +aesp8-ppc.S ghashp10-ppc.S +ghashp8-ppc.S diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index de10437fd206..fd594bf6c6a9 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -92,9 +92,25 @@ __pu_failed: \ : label) #endif +#ifdef CONFIG_CC_IS_CLANG +#define DS_FORM_CONSTRAINT "Z<>" +#else +#define DS_FORM_CONSTRAINT "YZ<>" +#endif + #ifdef __powerpc64__ +#ifdef CONFIG_PPC_KERNEL_PREFIXED #define __put_user_asm2_goto(x, ptr, label) \ __put_user_asm_goto(x, ptr, label, "std") +#else +#define __put_user_asm2_goto(x, addr, label) \ + asm goto ("1: std%U1%X1 %0,%1 # put_user\n" \ + EX_TABLE(1b, %l2) \ + : \ + : "r" (x), DS_FORM_CONSTRAINT (*addr) \ + : \ + : label) +#endif // CONFIG_PPC_KERNEL_PREFIXED #else /* __powerpc64__ */ #define __put_user_asm2_goto(x, addr, label) \ asm goto( \ @@ -165,8 +181,19 @@ do { \ #endif #ifdef __powerpc64__ +#ifdef CONFIG_PPC_KERNEL_PREFIXED #define __get_user_asm2_goto(x, addr, label) \ __get_user_asm_goto(x, addr, label, "ld") +#else +#define __get_user_asm2_goto(x, addr, label) \ + asm_goto_output( \ + "1: ld%U1%X1 %0, %1 # get_user\n" \ + EX_TABLE(1b, %l2) \ + : "=r" (x) \ + : DS_FORM_CONSTRAINT (*addr) \ + : \ + : label) +#endif // CONFIG_PPC_KERNEL_PREFIXED #else /* __powerpc64__ */ #define __get_user_asm2_goto(x, addr, label) \ asm_goto_output( \ diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index b569ebaa590e..3ff3de9a52ac 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -130,14 +130,16 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, } rcu_read_unlock(); - fdput(f); - - if (!found) + if (!found) { + fdput(f); return -EINVAL; + } table_group = iommu_group_get_iommudata(grp); - if (WARN_ON(!table_group)) + if (WARN_ON(!table_group)) { + fdput(f); return -EFAULT; + } for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { struct iommu_table *tbltmp = table_group->tables[i]; @@ -158,8 +160,10 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, break; } } - if (!tbl) + if (!tbl) { + fdput(f); return -EINVAL; + } rcu_read_lock(); list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { @@ -170,6 +174,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, /* stit is being destroyed */ iommu_tce_table_put(tbl); rcu_read_unlock(); + fdput(f); return -ENOTTY; } /* @@ -177,6 +182,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, * its KVM reference counter and can return. */ rcu_read_unlock(); + fdput(f); return 0; } rcu_read_unlock(); @@ -184,6 +190,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, stit = kzalloc(sizeof(*stit), GFP_KERNEL); if (!stit) { iommu_tce_table_put(tbl); + fdput(f); return -ENOMEM; } @@ -192,6 +199,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, list_add_rcu(&stit->next, &stt->iommu_tables); + fdput(f); return 0; } diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 43b97032a91c..a0c4f1bde83e 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -900,6 +900,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code /* Get offset into TMP_REG */ EMIT(PPC_RAW_LI(tmp_reg, off)); + /* + * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' + * before and after the operation. + * + * This is a requirement in the Linux Kernel Memory Model. + * See __cmpxchg_u32() in asm/cmpxchg.h as an example. + */ + if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); tmp_idx = ctx->idx * 4; /* load value from memory into r0 */ EMIT(PPC_RAW_LWARX(_R0, tmp_reg, dst_reg, 0)); @@ -953,6 +962,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code /* For the BPF_FETCH variant, get old data into src_reg */ if (imm & BPF_FETCH) { + /* Emit 'sync' to enforce full ordering */ + if (IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); EMIT(PPC_RAW_MR(ret_reg, ax_reg)); if (!fp->aux->verifier_zext) EMIT(PPC_RAW_LI(ret_reg - 1, 0)); /* higher 32-bit */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 8afc14a4a125..7703dcf48be8 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -846,6 +846,15 @@ emit_clear: /* Get offset into TMP_REG_1 */ EMIT(PPC_RAW_LI(tmp1_reg, off)); + /* + * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' + * before and after the operation. + * + * This is a requirement in the Linux Kernel Memory Model. + * See __cmpxchg_u64() in asm/cmpxchg.h as an example. + */ + if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); tmp_idx = ctx->idx * 4; /* load value from memory into TMP_REG_2 */ if (size == BPF_DW) @@ -908,6 +917,9 @@ emit_clear: PPC_BCC_SHORT(COND_NE, tmp_idx); if (imm & BPF_FETCH) { + /* Emit 'sync' to enforce full ordering */ + if (IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); EMIT(PPC_RAW_MR(ret_reg, _R0)); /* * Skip unnecessary zero-extension for 32-bit cmpxchg. diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 6e7029640c0c..62da20f9700a 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -371,8 +371,8 @@ static int read_dt_lpar_name(struct seq_file *m) static void read_lpar_name(struct seq_file *m) { - if (read_rtas_lpar_name(m) && read_dt_lpar_name(m)) - pr_err_once("Error can't get the LPAR name"); + if (read_rtas_lpar_name(m)) + read_dt_lpar_name(m); } #define SPLPAR_MAXLENGTH 1026*(sizeof(char)) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b94176e25be1..0525ee2d63c7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -106,7 +106,7 @@ config RISCV select HAS_IOPORT if MMU select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP - select HAVE_ARCH_HUGE_VMAP if MMU && 64BIT && !XIP_KERNEL + select HAVE_ARCH_HUGE_VMAP if MMU && 64BIT select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && 64BIT diff --git a/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts b/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts index cd013588adc0..375ff2661b6e 100644 --- a/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts +++ b/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts @@ -45,6 +45,7 @@ no-1-8-v; no-mmc; no-sdio; + disable-wp; }; &uart0 { diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index ddb002ed89de..808b4c78462e 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -10,7 +10,7 @@ #include <asm/fence.h> -#define __arch_xchg_masked(prepend, append, r, p, n) \ +#define __arch_xchg_masked(sc_sfx, prepend, append, r, p, n) \ ({ \ u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3); \ ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE; \ @@ -25,7 +25,7 @@ "0: lr.w %0, %2\n" \ " and %1, %0, %z4\n" \ " or %1, %1, %z3\n" \ - " sc.w %1, %1, %2\n" \ + " sc.w" sc_sfx " %1, %1, %2\n" \ " bnez %1, 0b\n" \ append \ : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \ @@ -46,7 +46,8 @@ : "memory"); \ }) -#define _arch_xchg(ptr, new, sfx, prepend, append) \ +#define _arch_xchg(ptr, new, sc_sfx, swap_sfx, prepend, \ + sc_append, swap_append) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ __typeof__(*(__ptr)) __new = (new); \ @@ -55,15 +56,15 @@ switch (sizeof(*__ptr)) { \ case 1: \ case 2: \ - __arch_xchg_masked(prepend, append, \ + __arch_xchg_masked(sc_sfx, prepend, sc_append, \ __ret, __ptr, __new); \ break; \ case 4: \ - __arch_xchg(".w" sfx, prepend, append, \ + __arch_xchg(".w" swap_sfx, prepend, swap_append, \ __ret, __ptr, __new); \ break; \ case 8: \ - __arch_xchg(".d" sfx, prepend, append, \ + __arch_xchg(".d" swap_sfx, prepend, swap_append, \ __ret, __ptr, __new); \ break; \ default: \ @@ -73,16 +74,17 @@ }) #define arch_xchg_relaxed(ptr, x) \ - _arch_xchg(ptr, x, "", "", "") + _arch_xchg(ptr, x, "", "", "", "", "") #define arch_xchg_acquire(ptr, x) \ - _arch_xchg(ptr, x, "", "", RISCV_ACQUIRE_BARRIER) + _arch_xchg(ptr, x, "", "", "", \ + RISCV_ACQUIRE_BARRIER, RISCV_ACQUIRE_BARRIER) #define arch_xchg_release(ptr, x) \ - _arch_xchg(ptr, x, "", RISCV_RELEASE_BARRIER, "") + _arch_xchg(ptr, x, "", "", RISCV_RELEASE_BARRIER, "", "") #define arch_xchg(ptr, x) \ - _arch_xchg(ptr, x, ".aqrl", "", "") + _arch_xchg(ptr, x, ".rl", ".aqrl", "", RISCV_FULL_BARRIER, "") #define xchg32(ptr, x) \ ({ \ diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h index 06e439eeef9a..09fde95a5e8f 100644 --- a/arch/riscv/include/asm/insn.h +++ b/arch/riscv/include/asm/insn.h @@ -145,7 +145,7 @@ /* parts of opcode for RVF, RVD and RVQ */ #define RVFDQ_FL_FS_WIDTH_OFF 12 -#define RVFDQ_FL_FS_WIDTH_MASK GENMASK(3, 0) +#define RVFDQ_FL_FS_WIDTH_MASK GENMASK(2, 0) #define RVFDQ_FL_FS_WIDTH_W 2 #define RVFDQ_FL_FS_WIDTH_D 3 #define RVFDQ_LS_FS_WIDTH_Q 4 diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c index 1cc7df740edd..e6fbaaf54956 100644 --- a/arch/riscv/kernel/cpu_ops_sbi.c +++ b/arch/riscv/kernel/cpu_ops_sbi.c @@ -72,7 +72,7 @@ static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle) /* Make sure tidle is updated */ smp_mb(); bdata->task_ptr = tidle; - bdata->stack_ptr = task_stack_page(tidle) + THREAD_SIZE; + bdata->stack_ptr = task_pt_regs(tidle); /* Make sure boot data is updated */ smp_mb(); hsm_data = __pa(bdata); diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c index 613872b0a21a..24869eb88908 100644 --- a/arch/riscv/kernel/cpu_ops_spinwait.c +++ b/arch/riscv/kernel/cpu_ops_spinwait.c @@ -34,8 +34,7 @@ static void cpu_update_secondary_bootdata(unsigned int cpuid, /* Make sure tidle is updated */ smp_mb(); - WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], - task_stack_page(tidle) + THREAD_SIZE); + WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], task_pt_regs(tidle)); WRITE_ONCE(__cpu_spinwait_task_pointer[hartid], tidle); } diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 87cbd86576b2..4b95c574fd04 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -120,9 +120,6 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) out = ftrace_make_nop(mod, rec, MCOUNT_ADDR); mutex_unlock(&text_mutex); - if (!mod) - local_flush_icache_range(rec->ip, rec->ip + MCOUNT_INSN_SIZE); - return out; } @@ -156,9 +153,9 @@ static int __ftrace_modify_code(void *data) } else { while (atomic_read(¶m->cpu_count) <= num_online_cpus()) cpu_relax(); - } - local_flush_icache_all(); + local_flush_icache_all(); + } return 0; } diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 4007563fb607..ab03732d06c4 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -89,6 +89,14 @@ static int __patch_insn_set(void *addr, u8 c, size_t len) memset(waddr, c, len); + /* + * We could have just patched a function that is about to be + * called so make sure we don't execute partially patched + * instructions by flushing the icache as soon as possible. + */ + local_flush_icache_range((unsigned long)waddr, + (unsigned long)waddr + len); + patch_unmap(FIX_TEXT_POKE0); if (across_pages) @@ -135,6 +143,14 @@ static int __patch_insn_write(void *addr, const void *insn, size_t len) ret = copy_to_kernel_nofault(waddr, insn, len); + /* + * We could have just patched a function that is about to be + * called so make sure we don't execute partially patched + * instructions by flushing the icache as soon as possible. + */ + local_flush_icache_range((unsigned long)waddr, + (unsigned long)waddr + len); + patch_unmap(FIX_TEXT_POKE0); if (across_pages) @@ -189,9 +205,6 @@ int patch_text_set_nosync(void *addr, u8 c, size_t len) ret = patch_insn_set(tp, c, len); - if (!ret) - flush_icache_range((uintptr_t)tp, (uintptr_t)tp + len); - return ret; } NOKPROBE_SYMBOL(patch_text_set_nosync); @@ -224,9 +237,6 @@ int patch_text_nosync(void *addr, const void *insns, size_t len) ret = patch_insn_write(tp, insns, len); - if (!ret) - flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len); - return ret; } NOKPROBE_SYMBOL(patch_text_nosync); @@ -253,9 +263,9 @@ static int patch_text_cb(void *data) } else { while (atomic_read(&patch->cpu_count) <= num_online_cpus()) cpu_relax(); - } - local_flush_icache_all(); + local_flush_icache_all(); + } return ret; } diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index 0eb689351b7d..5cd407c6a8e4 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -237,10 +237,11 @@ static gpa_t aia_imsic_ppn(struct kvm_aia *aia, gpa_t addr) static u32 aia_imsic_hart_index(struct kvm_aia *aia, gpa_t addr) { - u32 hart, group = 0; + u32 hart = 0, group = 0; - hart = (addr >> (aia->nr_guest_bits + IMSIC_MMIO_PAGE_SHIFT)) & - GENMASK_ULL(aia->nr_hart_bits - 1, 0); + if (aia->nr_hart_bits) + hart = (addr >> (aia->nr_guest_bits + IMSIC_MMIO_PAGE_SHIFT)) & + GENMASK_ULL(aia->nr_hart_bits - 1, 0); if (aia->nr_group_bits) group = (addr >> aia->nr_group_shift) & GENMASK_ULL(aia->nr_group_bits - 1, 0); diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index c676275ea0a0..62874fbca29f 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -724,9 +724,9 @@ static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu, switch (reg_subtype) { case KVM_REG_RISCV_ISA_SINGLE: return riscv_vcpu_set_isa_ext_single(vcpu, reg_num, reg_val); - case KVM_REG_RISCV_SBI_MULTI_EN: + case KVM_REG_RISCV_ISA_MULTI_EN: return riscv_vcpu_set_isa_ext_multi(vcpu, reg_num, reg_val, true); - case KVM_REG_RISCV_SBI_MULTI_DIS: + case KVM_REG_RISCV_ISA_MULTI_DIS: return riscv_vcpu_set_isa_ext_multi(vcpu, reg_num, reg_val, false); default: return -ENOENT; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index b3fcf7d67efb..5224f3733802 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -293,8 +293,8 @@ void handle_page_fault(struct pt_regs *regs) if (unlikely(access_error(cause, vma))) { vma_end_read(vma); count_vm_vma_lock_event(VMA_LOCK_SUCCESS); - tsk->thread.bad_cause = SEGV_ACCERR; - bad_area_nosemaphore(regs, code, addr); + tsk->thread.bad_cause = cause; + bad_area_nosemaphore(regs, SEGV_ACCERR, addr); return; } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index e3218d65f21d..e3405e4b99af 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -250,18 +250,19 @@ static void __init setup_bootmem(void) kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base; /* - * memblock allocator is not aware of the fact that last 4K bytes of - * the addressable memory can not be mapped because of IS_ERR_VALUE - * macro. Make sure that last 4k bytes are not usable by memblock - * if end of dram is equal to maximum addressable memory. For 64-bit - * kernel, this problem can't happen here as the end of the virtual - * address space is occupied by the kernel mapping then this check must - * be done as soon as the kernel mapping base address is determined. + * Reserve physical address space that would be mapped to virtual + * addresses greater than (void *)(-PAGE_SIZE) because: + * - This memory would overlap with ERR_PTR + * - This memory belongs to high memory, which is not supported + * + * This is not applicable to 64-bit kernel, because virtual addresses + * after (void *)(-PAGE_SIZE) are not linearly mapped: they are + * occupied by kernel mapping. Also it is unrealistic for high memory + * to exist on 64-bit platforms. */ if (!IS_ENABLED(CONFIG_64BIT)) { - max_mapped_addr = __pa(~(ulong)0); - if (max_mapped_addr == (phys_ram_end - 1)) - memblock_set_current_limit(max_mapped_addr - 4096); + max_mapped_addr = __va_to_pa_nodebug(-PAGE_SIZE); + memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr); } min_low_pfn = PFN_UP(phys_ram_base); diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 182aac6a0f77..48ef5fe5c08a 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -384,7 +384,7 @@ static void fixup_vmlinux_info(void) void startup_kernel(void) { unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size; - unsigned long nokaslr_offset_phys = mem_safe_offset(); + unsigned long nokaslr_offset_phys, kaslr_large_page_offset; unsigned long amode31_lma = 0; unsigned long max_physmem_end; unsigned long asce_limit; @@ -393,6 +393,12 @@ void startup_kernel(void) fixup_vmlinux_info(); setup_lpp(); + + /* + * Non-randomized kernel physical start address must be _SEGMENT_SIZE + * aligned (see blow). + */ + nokaslr_offset_phys = ALIGN(mem_safe_offset(), _SEGMENT_SIZE); safe_addr = PAGE_ALIGN(nokaslr_offset_phys + kernel_size); /* @@ -425,10 +431,25 @@ void startup_kernel(void) save_ipl_cert_comp_list(); rescue_initrd(safe_addr, ident_map_size); - if (kaslr_enabled()) - __kaslr_offset_phys = randomize_within_range(kernel_size, THREAD_SIZE, 0, ident_map_size); + /* + * __kaslr_offset_phys must be _SEGMENT_SIZE aligned, so the lower + * 20 bits (the offset within a large page) are zero. Copy the last + * 20 bits of __kaslr_offset, which is THREAD_SIZE aligned, to + * __kaslr_offset_phys. + * + * With this the last 20 bits of __kaslr_offset_phys and __kaslr_offset + * are identical, which is required to allow for large mappings of the + * kernel image. + */ + kaslr_large_page_offset = __kaslr_offset & ~_SEGMENT_MASK; + if (kaslr_enabled()) { + unsigned long end = ident_map_size - kaslr_large_page_offset; + + __kaslr_offset_phys = randomize_within_range(kernel_size, _SEGMENT_SIZE, 0, end); + } if (!__kaslr_offset_phys) __kaslr_offset_phys = nokaslr_offset_phys; + __kaslr_offset_phys |= kaslr_large_page_offset; kaslr_adjust_vmlinux_info(__kaslr_offset_phys); physmem_reserve(RR_VMLINUX, __kaslr_offset_phys, kernel_size); deploy_kernel((void *)__kaslr_offset_phys); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 96d48b7112d4..40cfce2687c4 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -261,21 +261,27 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m static bool large_allowed(enum populate_mode mode) { - return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY); + return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY) || (mode == POPULATE_KERNEL); } static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end, enum populate_mode mode) { + unsigned long size = end - addr; + return machine.has_edat2 && large_allowed(mode) && - IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; + IS_ALIGNED(addr, PUD_SIZE) && (size >= PUD_SIZE) && + IS_ALIGNED(_pa(addr, size, mode), PUD_SIZE); } static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end, enum populate_mode mode) { + unsigned long size = end - addr; + return machine.has_edat1 && large_allowed(mode) && - IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; + IS_ALIGNED(addr, PMD_SIZE) && (size >= PMD_SIZE) && + IS_ALIGNED(_pa(addr, size, mode), PMD_SIZE); } static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end, diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index 1fe5a1d3ff60..a750711d44c8 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -109,6 +109,7 @@ SECTIONS #ifdef CONFIG_KERNEL_UNCOMPRESSED . = ALIGN(PAGE_SIZE); . += AMODE31_SIZE; /* .amode31 section */ + . = ALIGN(1 << 20); /* _SEGMENT_SIZE */ #else . = ALIGN(8); #endif diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 145342e46ea8..8c4adece8911 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -43,7 +43,6 @@ CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_KEXEC_SIG=y -CONFIG_CRASH_DUMP=y CONFIG_LIVEPATCH=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 @@ -51,6 +50,7 @@ CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y +# CONFIG_EXPOLINE_EXTERN is not set CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m @@ -76,6 +76,7 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG_SHA256=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y @@ -100,7 +101,6 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y CONFIG_CMA_SYSFS=y CONFIG_CMA_AREAS=7 @@ -119,6 +119,7 @@ CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_SMC_DIAG=m +CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y @@ -133,7 +134,6 @@ CONFIG_IP_MROUTE=y CONFIG_IP_MROUTE_MULTIPLE_TABLES=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -167,6 +167,7 @@ CONFIG_BRIDGE_NETFILTER=m CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y @@ -183,17 +184,39 @@ CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y CONFIG_NF_TABLES=m CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_CONNLIMIT=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m -CONFIG_NETFILTER_XTABLES_COMPAT=y +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NFT_REJECT_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_FLOW_TABLE_PROCFS=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -206,8 +229,10 @@ CONFIG_NETFILTER_XT_TARGET_HMARK=m CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m CONFIG_NETFILTER_XT_TARGET_LOG=m CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m CONFIG_NETFILTER_XT_TARGET_NFLOG=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m CONFIG_NETFILTER_XT_TARGET_TEE=m CONFIG_NETFILTER_XT_TARGET_TPROXY=m CONFIG_NETFILTER_XT_TARGET_TRACE=m @@ -216,6 +241,7 @@ CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m CONFIG_NETFILTER_XT_MATCH_CLUSTER=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m @@ -230,6 +256,7 @@ CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m CONFIG_NETFILTER_XT_MATCH_IPRANGE=m CONFIG_NETFILTER_XT_MATCH_IPVS=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m @@ -247,6 +274,7 @@ CONFIG_NETFILTER_XT_MATCH_QUOTA=m CONFIG_NETFILTER_XT_MATCH_RATEEST=m CONFIG_NETFILTER_XT_MATCH_REALM=m CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m CONFIG_NETFILTER_XT_MATCH_STATE=m CONFIG_NETFILTER_XT_MATCH_STATISTIC=m CONFIG_NETFILTER_XT_MATCH_STRING=m @@ -302,7 +330,6 @@ CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_FIB_IPV6=m @@ -373,7 +400,6 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m @@ -462,6 +488,7 @@ CONFIG_DM_VERITY=m CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y CONFIG_DM_SWITCH=m CONFIG_DM_INTEGRITY=m +CONFIG_DM_VDO=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m @@ -574,7 +601,6 @@ CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m -# CONFIG_DRM_DEBUG_MODESET_LOCK is not set CONFIG_FB=y # CONFIG_FB_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y @@ -645,7 +671,6 @@ CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m -CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y @@ -663,6 +688,7 @@ CONFIG_SQUASHFS_XZ=y CONFIG_SQUASHFS_ZSTD=y CONFIG_ROMFS_FS=m CONFIG_NFS_FS=m +CONFIG_NFS_V2=m CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y @@ -879,6 +905,5 @@ CONFIG_RBTREE_TEST=y CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y -CONFIG_STRING_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index dc237896f99d..6dd11d3b6aaa 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -41,7 +41,6 @@ CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_KEXEC_SIG=y -CONFIG_CRASH_DUMP=y CONFIG_LIVEPATCH=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 @@ -49,6 +48,7 @@ CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y +# CONFIG_EXPOLINE_EXTERN is not set CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m @@ -71,6 +71,7 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG_SHA256=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y @@ -110,6 +111,7 @@ CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_SMC_DIAG=m +CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y @@ -124,7 +126,6 @@ CONFIG_IP_MROUTE=y CONFIG_IP_MROUTE_MULTIPLE_TABLES=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -158,6 +159,7 @@ CONFIG_BRIDGE_NETFILTER=m CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y @@ -174,17 +176,39 @@ CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y CONFIG_NF_TABLES=m CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_CONNLIMIT=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m -CONFIG_NETFILTER_XTABLES_COMPAT=y +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NFT_REJECT_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_FLOW_TABLE_PROCFS=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -197,8 +221,10 @@ CONFIG_NETFILTER_XT_TARGET_HMARK=m CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m CONFIG_NETFILTER_XT_TARGET_LOG=m CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m CONFIG_NETFILTER_XT_TARGET_NFLOG=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m CONFIG_NETFILTER_XT_TARGET_TEE=m CONFIG_NETFILTER_XT_TARGET_TPROXY=m CONFIG_NETFILTER_XT_TARGET_TRACE=m @@ -207,6 +233,7 @@ CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m CONFIG_NETFILTER_XT_MATCH_CLUSTER=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m @@ -221,6 +248,7 @@ CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m CONFIG_NETFILTER_XT_MATCH_IPRANGE=m CONFIG_NETFILTER_XT_MATCH_IPVS=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m @@ -238,6 +266,7 @@ CONFIG_NETFILTER_XT_MATCH_QUOTA=m CONFIG_NETFILTER_XT_MATCH_RATEEST=m CONFIG_NETFILTER_XT_MATCH_REALM=m CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m CONFIG_NETFILTER_XT_MATCH_STATE=m CONFIG_NETFILTER_XT_MATCH_STATISTIC=m CONFIG_NETFILTER_XT_MATCH_STRING=m @@ -293,7 +322,6 @@ CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_FIB_IPV6=m @@ -363,7 +391,6 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m @@ -452,6 +479,7 @@ CONFIG_DM_VERITY=m CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y CONFIG_DM_SWITCH=m CONFIG_DM_INTEGRITY=m +CONFIG_DM_VDO=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m @@ -630,7 +658,6 @@ CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m -CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y @@ -649,6 +676,7 @@ CONFIG_SQUASHFS_XZ=y CONFIG_SQUASHFS_ZSTD=y CONFIG_ROMFS_FS=m CONFIG_NFS_FS=m +CONFIG_NFS_V2=m CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index c51f3ec4eb28..8c2b61363bab 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -9,25 +9,22 @@ CONFIG_BPF_SYSCALL=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KEXEC=y -CONFIG_CRASH_DUMP=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=2 CONFIG_HZ_100=y # CONFIG_CHSC_SCH is not set # CONFIG_SCM_BUS is not set +# CONFIG_AP is not set # CONFIG_PFAULT is not set # CONFIG_S390_HYPFS is not set # CONFIG_VIRTUALIZATION is not set # CONFIG_S390_GUEST is not set # CONFIG_SECCOMP is not set -# CONFIG_GCC_PLUGINS is not set # CONFIG_BLOCK_LEGACY_AUTOLOAD is not set CONFIG_PARTITION_ADVANCED=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_SWAP is not set # CONFIG_COMPAT_BRK is not set -# CONFIG_COMPACTION is not set -# CONFIG_MIGRATION is not set CONFIG_NET=y # CONFIG_IUCV is not set # CONFIG_PCPU_DEV_REFCNT is not set diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 9863ebe75019..edae13416196 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -451,7 +451,7 @@ static void *nt_final(void *ptr) /* * Initialize ELF header (new kernel) */ -static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) +static void *ehdr_init(Elf64_Ehdr *ehdr, int phdr_count) { memset(ehdr, 0, sizeof(*ehdr)); memcpy(ehdr->e_ident, ELFMAG, SELFMAG); @@ -465,11 +465,8 @@ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) ehdr->e_phoff = sizeof(Elf64_Ehdr); ehdr->e_ehsize = sizeof(Elf64_Ehdr); ehdr->e_phentsize = sizeof(Elf64_Phdr); - /* - * Number of memory chunk PT_LOAD program headers plus one kernel - * image PT_LOAD program header plus one PT_NOTE program header. - */ - ehdr->e_phnum = mem_chunk_cnt + 1 + 1; + /* Number of PT_LOAD program headers plus PT_NOTE program header */ + ehdr->e_phnum = phdr_count + 1; return ehdr + 1; } @@ -503,12 +500,14 @@ static int get_mem_chunk_cnt(void) /* * Initialize ELF loads (new kernel) */ -static void loads_init(Elf64_Phdr *phdr) +static void loads_init(Elf64_Phdr *phdr, bool os_info_has_vm) { - unsigned long old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); + unsigned long old_identity_base = 0; phys_addr_t start, end; u64 idx; + if (os_info_has_vm) + old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); for_each_physmem_range(idx, &oldmem_type, &start, &end) { phdr->p_type = PT_LOAD; phdr->p_vaddr = old_identity_base + start; @@ -522,6 +521,11 @@ static void loads_init(Elf64_Phdr *phdr) } } +static bool os_info_has_vm(void) +{ + return os_info_old_value(OS_INFO_KASLR_OFFSET); +} + /* * Prepare PT_LOAD type program header for kernel image region */ @@ -566,7 +570,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) return ptr; } -static size_t get_elfcorehdr_size(int mem_chunk_cnt) +static size_t get_elfcorehdr_size(int phdr_count) { size_t size; @@ -581,10 +585,8 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) size += nt_vmcoreinfo_size(); /* nt_final */ size += sizeof(Elf64_Nhdr); - /* PT_LOAD type program header for kernel text region */ - size += sizeof(Elf64_Phdr); /* PT_LOADS */ - size += mem_chunk_cnt * sizeof(Elf64_Phdr); + size += phdr_count * sizeof(Elf64_Phdr); return size; } @@ -595,8 +597,8 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { Elf64_Phdr *phdr_notes, *phdr_loads, *phdr_text; + int mem_chunk_cnt, phdr_text_cnt; size_t alloc_size; - int mem_chunk_cnt; void *ptr, *hdr; u64 hdr_off; @@ -615,12 +617,14 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) } mem_chunk_cnt = get_mem_chunk_cnt(); + phdr_text_cnt = os_info_has_vm() ? 1 : 0; - alloc_size = get_elfcorehdr_size(mem_chunk_cnt); + alloc_size = get_elfcorehdr_size(mem_chunk_cnt + phdr_text_cnt); hdr = kzalloc(alloc_size, GFP_KERNEL); - /* Without elfcorehdr /proc/vmcore cannot be created. Thus creating + /* + * Without elfcorehdr /proc/vmcore cannot be created. Thus creating * a dump with this crash kernel will fail. Panic now to allow other * dump mechanisms to take over. */ @@ -628,21 +632,23 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) panic("s390 kdump allocating elfcorehdr failed"); /* Init elf header */ - ptr = ehdr_init(hdr, mem_chunk_cnt); + phdr_notes = ehdr_init(hdr, mem_chunk_cnt + phdr_text_cnt); /* Init program headers */ - phdr_notes = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); - phdr_text = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); - phdr_loads = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); + if (phdr_text_cnt) { + phdr_text = phdr_notes + 1; + phdr_loads = phdr_text + 1; + } else { + phdr_loads = phdr_notes + 1; + } + ptr = PTR_ADD(phdr_loads, sizeof(Elf64_Phdr) * mem_chunk_cnt); /* Init notes */ hdr_off = PTR_DIFF(ptr, hdr); ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); /* Init kernel text program header */ - text_init(phdr_text); + if (phdr_text_cnt) + text_init(phdr_text); /* Init loads */ - loads_init(phdr_loads); + loads_init(phdr_loads, phdr_text_cnt); /* Finalize program headers */ hdr_off = PTR_DIFF(ptr, hdr); *addr = (unsigned long long) hdr; diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 243ee86cb1b1..f2051644de94 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -105,9 +105,9 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o -vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a -$(obj)/vmlinux: $(vmlinux-objs-y) FORCE +$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE $(call if_changed,ld) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index e64eaa8dda5a..9d6e8f13d13a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -114,6 +114,7 @@ #include "../perf_event.h" #include "../probe.h" +MODULE_DESCRIPTION("Support for Intel cstate performance events"); MODULE_LICENSE("GPL"); #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 419c517b8594..c68f5b39952b 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -34,6 +34,7 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +MODULE_DESCRIPTION("Support for Intel uncore performance events"); MODULE_LICENSE("GPL"); int uncore_pcibus_to_dieid(struct pci_bus *bus) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 46e673585560..0c5e7a7c43ac 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -64,6 +64,7 @@ #include "perf_event.h" #include "probe.h" +MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters"); MODULE_LICENSE("GPL"); /* diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 1dc600fa3ba5..481096177500 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -401,7 +401,6 @@ extern int __init efi_memmap_alloc(unsigned int num_entries, struct efi_memory_map_data *data); extern void __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags); -#define __efi_memmap_free __efi_memmap_free extern int __init efi_memmap_install(struct efi_memory_map_data *data); extern int __init efi_memmap_split_count(efi_memory_desc_t *md, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ece45b3f6f20..f8ca74e7678f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2154,6 +2154,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, unsigned long roots); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 0f9bab92a43d..3a7755c1a441 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -78,10 +78,10 @@ extern int __get_user_bad(void); int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ - asm volatile("call __" #fn "_%c4" \ + asm volatile("call __" #fn "_%c[size]" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ - : "0" (ptr), "i" (sizeof(*(ptr)))); \ + : "0" (ptr), [size] "i" (sizeof(*(ptr)))); \ instrument_get_user(__val_gu); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 266daf5b5b84..695f36664889 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -77,7 +77,7 @@ #define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* "" VM-Exit on ENCLS (leaf dependent) */ #define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* "" VM-Exit on RDSEED */ #define VMX_FEATURE_PAGE_MOD_LOGGING ( 2*32+ 17) /* "pml" Log dirty pages into buffer */ -#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "" Conditionally reflect EPT violations as #VE exceptions */ +#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* Conditionally reflect EPT violations as #VE exceptions */ #define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* "" Suppress VMX indicators in Processor Trace */ #define VMX_FEATURE_XSAVES ( 2*32+ 20) /* "" Enable XSAVES and XRSTORS in guest */ #define VMX_FEATURE_MODE_BASED_EPT_EXEC ( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. user */ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 3cf156f70859..027a8c7a2c9e 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -215,7 +215,14 @@ out: int amd_smn_read(u16 node, u32 address, u32 *value) { - return __amd_smn_rw(node, address, value, false); + int err = __amd_smn_rw(node, address, value, false); + + if (PCI_POSSIBLE_ERROR(*value)) { + err = -ENODEV; + *value = 0; + } + + return err; } EXPORT_SYMBOL_GPL(amd_smn_read); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index f9a8c7b7943f..b3fa61d45352 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -345,6 +345,7 @@ static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); static void scale_freq_tick(u64 acnt, u64 mcnt) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2b170da84f97..d4e539d4e158 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1075,6 +1075,10 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c) c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + + /* Provide a sane default if not enumerated: */ + if (!c->x86_clflush_size) + c->x86_clflush_size = 32; } c->x86_cache_bits = c->x86_phys_bits; @@ -1585,6 +1589,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (have_cpuid_p()) { cpu_detect(c); get_cpu_vendor(c); + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); setup_force_cpu_cap(X86_FEATURE_CPUID); get_cpu_address_sizes(c); @@ -1744,7 +1749,7 @@ static void generic_identify(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); - + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); get_cpu_address_sizes(c); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index ea9e07d57c8d..1beccefbaff9 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -61,9 +61,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); void tsx_ap_init(void); +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); #else static inline void tsx_init(void) { } static inline void tsx_ap_init(void) { } +static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { } #endif /* CONFIG_CPU_SUP_INTEL */ extern void init_spectral_chicken(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3c3e7e5695ba..fdf3489d92a4 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -269,19 +269,26 @@ detect_keyid_bits: c->x86_phys_bits -= keyid_bits; } +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd)) + return; + + /* + * The BIOS can have limited CPUID to leaf 2, which breaks feature + * enumeration. Unlock it and update the maximum leaf info. + */ + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) + c->cpuid_level = cpuid_eax(0); +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; - /* Unmask CPUID levels if masked: */ - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { - c->cpuid_level = cpuid_eax(0); - get_cpu_cap(c); - } - } - if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2345e6836593..366f496ca3ce 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -519,7 +519,8 @@ void free_rmid(u32 closid, u32 rmid) * allows architectures that ignore the closid parameter to avoid an * unnecessary check. */ - if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID)) return; diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index d419deed6a48..7d476fa697ca 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -84,9 +84,9 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) /* * If leaf 0xb is available, then the domain shifts are set - * already and nothing to do here. + * already and nothing to do here. Only valid for family >= 0x17. */ - if (!has_topoext) { + if (!has_topoext && tscan->c->x86 >= 0x17) { /* * Leaf 0x80000008 set the CORE domain shift already. * Update the SMT domain, but do not propagate it. diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index b180d8e497c3..cc0f7f70b17b 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -295,8 +295,15 @@ void machine_kexec_cleanup(struct kimage *image) void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; - void *control_page; + unsigned int host_mem_enc_active; int save_ftrace_enabled; + void *control_page; + + /* + * This must be done before load_segments() since if call depth tracking + * is used then GS must be valid to make any function calls. + */ + host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -358,7 +365,7 @@ void machine_kexec(struct kimage *image) (unsigned long)page_list, image->start, image->preserve_context, - cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); + host_mem_enc_active); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d64fb2b3eb69..fec95a770270 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -44,6 +44,7 @@ config KVM select KVM_VFIO select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING + select KVM_WERROR if WERROR help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -66,7 +67,7 @@ config KVM_WERROR # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning. # Building KVM with -Werror and KASAN is still doable via enabling # the kernel-wide WERROR=y. - depends on KVM && EXPERT && !KASAN + depends on KVM && ((EXPERT && !KASAN) || WERROR) help Add -Werror to the build flags for KVM. @@ -97,15 +98,17 @@ config KVM_INTEL config KVM_INTEL_PROVE_VE bool "Check that guests do not receive #VE exceptions" - default KVM_PROVE_MMU || DEBUG_KERNEL - depends on KVM_INTEL + depends on KVM_INTEL && EXPERT help - Checks that KVM's page table management code will not incorrectly let guests receive a virtualization exception. Virtualization exceptions will be trapped by the hypervisor rather than injected in the guest. + Note: some CPUs appear to generate spurious EPT Violations #VEs + that trigger KVM's WARN, in particular with eptad=0 and/or nested + virtualization. + If unsure, say N. config X86_SGX_KVM diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ebf41023be38..acd7d48100a1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -59,7 +59,17 @@ #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 -static bool lapic_timer_advance_dynamic __read_mostly; +/* + * Enable local APIC timer advancement (tscdeadline mode only) with adaptive + * tuning. When enabled, KVM programs the host timer event to fire early, i.e. + * before the deadline expires, to account for the delay between taking the + * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume + * the guest, i.e. so that the interrupt arrives in the guest with minimal + * latency relative to the deadline programmed by the guest. + */ +static bool lapic_timer_advance __read_mostly = true; +module_param(lapic_timer_advance, bool, 0444); + #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_NS_INIT 1000 @@ -1854,16 +1864,14 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - if (lapic_timer_advance_dynamic) { - adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline); - /* - * If the timer fired early, reread the TSC to account for the - * overhead of the above adjustment to avoid waiting longer - * than is necessary. - */ - if (guest_tsc < tsc_deadline) - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - } + adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline); + + /* + * If the timer fired early, reread the TSC to account for the overhead + * of the above adjustment to avoid waiting longer than is necessary. + */ + if (guest_tsc < tsc_deadline) + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); @@ -2812,7 +2820,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) +int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -2845,13 +2853,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; - if (timer_advance_ns == -1) { + if (lapic_timer_advance) apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; - lapic_timer_advance_dynamic = true; - } else { - apic->lapic_timer.timer_advance_ns = timer_advance_ns; - lapic_timer_advance_dynamic = false; - } /* * Stuff the APIC ENABLE bit in lieu of temporarily incrementing diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 0a0ea4b5dd8c..a69e706b9080 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -85,7 +85,7 @@ struct kvm_lapic { struct dest_map; -int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns); +int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 662f62dfb2aa..8d74bdef68c1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -336,16 +336,19 @@ static int is_cpuid_PSE36(void) #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); return xchg(sptep, spte); } @@ -4101,23 +4104,31 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level return leaf; } -/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ -static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { - u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; - struct rsvd_bits_validate *rsvd_check; - int root, leaf, level; - bool reserved = false; + int leaf; walk_shadow_page_lockless_begin(vcpu); if (is_tdp_mmu_active(vcpu)) - leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level); else - leaf = get_walk(vcpu, addr, sptes, &root); + leaf = get_walk(vcpu, addr, sptes, root_level); walk_shadow_page_lockless_end(vcpu); + return leaf; +} + +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; + struct rsvd_bits_validate *rsvd_check; + int root, leaf, level; + bool reserved = false; + leaf = get_sptes_lockless(vcpu, addr, sptes, &root); if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; @@ -4400,9 +4411,6 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, return RET_PF_EMULATE; } - fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - /* * Check for a relevant mmu_notifier invalidation event before getting * the pfn from the primary MMU, and before acquiring mmu_lock. @@ -5921,6 +5929,22 @@ emulate: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; + int root_level, leaf, level; + + leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level); + if (unlikely(leaf < 0)) + return; + + pr_err("%s %llx", msg, gpa); + for (level = root_level; level >= leaf; level--) + pr_cont(", spte[%d] = 0x%llx", level, sptes[level]); + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes); + static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, hpa_t root_hpa) { diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 5dd5405fa07a..52fa004a1fbc 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -3,6 +3,8 @@ #ifndef KVM_X86_MMU_SPTE_H #define KVM_X86_MMU_SPTE_H +#include <asm/vmx.h> + #include "mmu.h" #include "mmu_internal.h" @@ -276,6 +278,13 @@ static inline bool is_shadow_present_pte(u64 pte) return !!(pte & SPTE_MMU_PRESENT_MASK); } +static inline bool is_ept_ve_possible(u64 spte) +{ + return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) && + !(spte & VMX_EPT_SUPPRESS_VE_BIT) && + (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; +} + /* * Returns true if A/D bits are supported in hardware and are enabled by KVM. * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index fae559559a80..2880fd392e0c 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -21,11 +21,13 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); return xchg(rcu_dereference(sptep), new_spte); } static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); WRITE_ONCE(*rcu_dereference(sptep), new_spte); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1259dd63defc..36539c1b36cd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -626,7 +626,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * SPTEs. */ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - 0, iter->level, true); + SHADOW_NONPRESENT_VALUE, iter->level, true); return 0; } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0623cfaa7bb0..95095a233a45 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -779,6 +779,14 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, */ fpstate_set_confidential(&vcpu->arch.guest_fpu); vcpu->arch.guest_state_protected = true; + + /* + * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it + * only after setting guest_state_protected because KVM_SET_MSRS allows + * dynamic toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); return 0; } @@ -2406,6 +2414,12 @@ void __init sev_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_SEV_ES)) goto out; + if (!lbrv) { + WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), + "LBRV must be present for SEV-ES support"); + goto out; + } + /* Has the system been allocated ASIDs for SEV-ES? */ if (min_sev_asid == 1) goto out; @@ -3216,7 +3230,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; /* * An SEV-ES guest requires a VMSA area that is a separate from the @@ -3268,10 +3281,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Clear intercepts on selected MSRs */ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } void sev_init_vmcb(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c8dc25886c16..c95d3900fe56 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -99,6 +99,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_SPEC_CTRL, .always = false }, { .index = MSR_IA32_PRED_CMD, .always = false }, { .index = MSR_IA32_FLUSH_CMD, .always = false }, + { .index = MSR_IA32_DEBUGCTLMSR, .always = false }, { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, @@ -215,7 +216,7 @@ int vgif = true; module_param(vgif, int, 0444); /* enable/disable LBR virtualization */ -static int lbrv = true; +int lbrv = true; module_param(lbrv, int, 0444); static int tsc_scaling = true; @@ -990,7 +991,7 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) vmcb_mark_dirty(to_vmcb, VMCB_LBR); } -static void svm_enable_lbrv(struct kvm_vcpu *vcpu) +void svm_enable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1000,6 +1001,9 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + if (sev_es_guest(vcpu->kvm)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); + /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ if (is_guest_mode(vcpu)) svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); @@ -1009,6 +1013,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); @@ -2822,10 +2828,24 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) return 0; } +static bool +sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + return sev_es_guest(vcpu->kvm) && + vcpu->arch.guest_state_protected && + svm_msrpm_offset(msr_info->index) != MSR_INVALID && + !msr_write_intercepted(vcpu, msr_info->index); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); + if (sev_es_prevent_msr_access(vcpu, msr_info)) { + msr_info->data = 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; + } + switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: if (!msr_info->host_initiated && @@ -2976,6 +2996,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; + + if (sev_es_prevent_msr_access(vcpu, msr)) + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; + switch (ecx) { case MSR_AMD64_TSC_RATIO: @@ -3846,16 +3870,27 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); /* - * KVM should never request an NMI window when vNMI is enabled, as KVM - * allows at most one to-be-injected NMI and one pending NMI, i.e. if - * two NMIs arrive simultaneously, KVM will inject one and set - * V_NMI_PENDING for the other. WARN, but continue with the standard - * single-step approach to try and salvage the pending NMI. + * If NMIs are outright masked, i.e. the vCPU is already handling an + * NMI, and KVM has not yet intercepted an IRET, then there is nothing + * more to do at this time as KVM has already enabled IRET intercepts. + * If KVM has already intercepted IRET, then single-step over the IRET, + * as NMIs aren't architecturally unmasked until the IRET completes. + * + * If vNMI is enabled, KVM should never request an NMI window if NMIs + * are masked, as KVM allows at most one to-be-injected NMI and one + * pending NMI. If two NMIs arrive simultaneously, KVM will inject one + * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are + * unmasked. KVM _will_ request an NMI window in some situations, e.g. + * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately + * inject the NMI. In those situations, KVM needs to single-step over + * the STI shadow or intercept STGI. */ - WARN_ON_ONCE(is_vnmi_enabled(svm)); + if (svm_get_nmi_mask(vcpu)) { + WARN_ON_ONCE(is_vnmi_enabled(svm)); - if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) - return; /* IRET will cause a vm exit */ + if (!svm->awaiting_iret_completion) + return; /* IRET will cause a vm exit */ + } /* * SEV-ES guests are responsible for signaling when a vCPU is ready to @@ -5265,6 +5300,12 @@ static __init int svm_hardware_setup(void) nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which * may be modified by svm_adjust_mmio_mask()), as well as nrips. @@ -5318,14 +5359,6 @@ static __init int svm_hardware_setup(void) svm_x86_ops.set_vnmi_pending = NULL; } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - if (!enable_pmu) pr_info("PMU virtualization is disabled\n"); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index be57213cd295..0f1472690b59 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -30,7 +30,7 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 47 +#define MAX_DIRECT_ACCESS_MSRS 48 #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -39,6 +39,7 @@ extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; extern bool vnmi; +extern int lbrv; /* * Clean bits in VMCB. @@ -552,6 +553,7 @@ u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); void svm_vcpu_free_msrpm(u32 *msrpm); void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); +void svm_enable_lbrv(struct kvm_vcpu *vcpu); void svm_update_lbrv(struct kvm_vcpu *vcpu); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d5b832126e34..643935a0f70a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2242,6 +2242,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); + if (vmx->ve_info) + vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); + /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); @@ -6230,6 +6233,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, else if (is_alignment_check(intr_info) && !vmx_guest_inject_ac(vcpu)) return true; + else if (is_ve_fault(intr_info)) + return true; return false; case EXIT_REASON_EXTERNAL_INTERRUPT: return true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6051fad5945f..b3c83c06f826 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5218,8 +5218,15 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); - if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm)) - return -EIO; + if (WARN_ON_ONCE(is_ve_fault(intr_info))) { + struct vmx_ve_information *ve_info = vmx->ve_info; + + WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, + "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); + dump_vmcs(vcpu); + kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); + return 1; + } error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 082ac6d95a3a..0763a0f72a06 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -164,15 +164,6 @@ module_param(kvmclock_periodic_sync, bool, 0444); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); -/* - * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables - * adaptive tuning starting from default advancement of 1000ns. '0' disables - * advancement entirely. Any other value is used as-is and disables adaptive - * tuning, i.e. allows privileged userspace to set an exact advancement time. - */ -static int __read_mostly lapic_timer_advance_ns = -1; -module_param(lapic_timer_advance_ns, int, 0644); - static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, 0444); @@ -10727,13 +10718,12 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); - else { - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); - if (ioapic_in_kernel(vcpu->kvm)) - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); - } + else if (ioapic_in_kernel(vcpu->kvm)) + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); if (is_guest_mode(vcpu)) vcpu->arch.load_eoi_exitmap_pending = true; @@ -12169,7 +12159,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (r < 0) return r; - r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); + r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 10d5ed8b5990..a1cb3a4e6742 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -44,7 +44,11 @@ or %rdx, %rax .else cmp $TASK_SIZE_MAX-\size+1, %eax +.if \size != 8 jae .Lbad_get_user +.else + jae .Lbad_get_user_8 +.endif sbb %edx, %edx /* array_index_mask_nospec() */ and %edx, %eax .endif @@ -154,7 +158,7 @@ SYM_CODE_END(__get_user_handle_exception) #ifdef CONFIG_X86_32 SYM_CODE_START_LOCAL(__get_user_8_handle_exception) ASM_CLAC -bad_get_user_8: +.Lbad_get_user_8: xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ce84ba86e69e..6ce10e3c6228 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -493,7 +493,7 @@ static void __init numa_clear_kernel_node_hotplug(void) for_each_reserved_mem_region(mb_region) { int nid = memblock_get_region_node(mb_region); - if (nid != MAX_NUMNODES) + if (nid != NUMA_NO_NODE) node_set(nid, reserved_nodemask); } @@ -614,9 +614,9 @@ static int __init numa_init(int (*init_func)(void)) nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, - MAX_NUMNODES)); + NUMA_NO_NODE)); WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, - MAX_NUMNODES)); + NUMA_NO_NODE)); /* In case that parsing SRAT failed. */ WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); numa_reset_distance(); diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c index 4ef20b49eb5e..6ed1935504b9 100644 --- a/arch/x86/platform/efi/memmap.c +++ b/arch/x86/platform/efi/memmap.c @@ -92,12 +92,22 @@ int __init efi_memmap_alloc(unsigned int num_entries, */ int __init efi_memmap_install(struct efi_memory_map_data *data) { + unsigned long size = efi.memmap.desc_size * efi.memmap.nr_map; + unsigned long flags = efi.memmap.flags; + u64 phys = efi.memmap.phys_map; + int ret; + efi_memmap_unmap(); if (efi_enabled(EFI_PARAVIRT)) return 0; - return __efi_memmap_init(data); + ret = __efi_memmap_init(data); + if (ret) + return ret; + + __efi_memmap_free(phys, size, flags); + return 0; } /** |