From f0616abd4e67143b45b04b565839148458857347 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Sun, 5 Dec 2021 16:47:35 -0800 Subject: arm64: clear_page() shouldn't use DC ZVA when DCZID_EL0.DZP == 1 Currently, clear_page() uses DC ZVA instruction unconditionally. But it should make sure that DCZID_EL0.DZP, which indicates whether or not use of DC ZVA instruction is prohibited, is zero when using the instruction. Use STNP instead when DCZID_EL0.DZP == 1. Fixes: f27bb139c387 ("arm64: Miscellaneous library functions") Signed-off-by: Reiji Watanabe Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20211206004736.1520989-2-reijiw@google.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/clear_page.S | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/arm64/lib') diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S index b84b179edba3..1fd5d790ab80 100644 --- a/arch/arm64/lib/clear_page.S +++ b/arch/arm64/lib/clear_page.S @@ -16,6 +16,7 @@ */ SYM_FUNC_START_PI(clear_page) mrs x1, dczid_el0 + tbnz x1, #4, 2f /* Branch if DC ZVA is prohibited */ and w1, w1, #0xf mov x2, #4 lsl x1, x2, x1 @@ -25,5 +26,14 @@ SYM_FUNC_START_PI(clear_page) tst x0, #(PAGE_SIZE - 1) b.ne 1b ret + +2: stnp xzr, xzr, [x0] + stnp xzr, xzr, [x0, #16] + stnp xzr, xzr, [x0, #32] + stnp xzr, xzr, [x0, #48] + add x0, x0, #64 + tst x0, #(PAGE_SIZE - 1) + b.ne 2b + ret SYM_FUNC_END_PI(clear_page) EXPORT_SYMBOL(clear_page) -- cgit v1.2.3-70-g09d2 From 685e2564daa1493053fcd7f1dbed38b35ee2f3cb Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Sun, 5 Dec 2021 16:47:36 -0800 Subject: arm64: mte: DC {GVA,GZVA} shouldn't be used when DCZID_EL0.DZP == 1 Currently, mte_set_mem_tag_range() and mte_zero_clear_page_tags() use DC {GVA,GZVA} unconditionally. But, they should make sure that DCZID_EL0.DZP, which indicates whether or not use of those instructions is prohibited, is zero when using those instructions. Use ST{G,ZG,Z2G} instead when DCZID_EL0.DZP == 1. Fixes: 013bb59dbb7c ("arm64: mte: handle tags zeroing at page allocation time") Fixes: 3d0cca0b02ac ("kasan: speed up mte_set_mem_tag_range") Signed-off-by: Reiji Watanabe Link: https://lore.kernel.org/r/20211206004736.1520989-3-reijiw@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mte-kasan.h | 8 +++++--- arch/arm64/lib/mte.S | 8 +++++++- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/arm64/lib') diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index 478b9bcf69ad..e4704a403237 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -84,10 +84,12 @@ static inline void __dc_gzva(u64 p) static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag, bool init) { - u64 curr, mask, dczid_bs, end1, end2, end3; + u64 curr, mask, dczid, dczid_bs, dczid_dzp, end1, end2, end3; /* Read DC G(Z)VA block size from the system register. */ - dczid_bs = 4ul << (read_cpuid(DCZID_EL0) & 0xf); + dczid = read_cpuid(DCZID_EL0); + dczid_bs = 4ul << (dczid & 0xf); + dczid_dzp = (dczid >> 4) & 1; curr = (u64)__tag_set(addr, tag); mask = dczid_bs - 1; @@ -106,7 +108,7 @@ static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag, */ #define SET_MEMTAG_RANGE(stg_post, dc_gva) \ do { \ - if (size >= 2 * dczid_bs) { \ + if (!dczid_dzp && size >= 2 * dczid_bs) {\ do { \ curr = stg_post(curr); \ } while (curr < end1); \ diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index e83643b3995f..f531dcb95174 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -43,17 +43,23 @@ SYM_FUNC_END(mte_clear_page_tags) * x0 - address to the beginning of the page */ SYM_FUNC_START(mte_zero_clear_page_tags) + and x0, x0, #(1 << MTE_TAG_SHIFT) - 1 // clear the tag mrs x1, dczid_el0 + tbnz x1, #4, 2f // Branch if DC GZVA is prohibited and w1, w1, #0xf mov x2, #4 lsl x1, x2, x1 - and x0, x0, #(1 << MTE_TAG_SHIFT) - 1 // clear the tag 1: dc gzva, x0 add x0, x0, x1 tst x0, #(PAGE_SIZE - 1) b.ne 1b ret + +2: stz2g x0, [x0], #(MTE_GRANULE_SIZE * 2) + tst x0, #(PAGE_SIZE - 1) + b.ne 2b + ret SYM_FUNC_END(mte_zero_clear_page_tags) /* -- cgit v1.2.3-70-g09d2 From 2c54b423cf85baed5ad9f9546f6c8ea741774a06 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 13 Dec 2021 15:02:52 +0100 Subject: arm64/xor: use EOR3 instructions when available Use the EOR3 instruction to implement xor_blocks() if the instruction is available, which is the case if the CPU implements the SHA-3 extension. This is about 20% faster on Apple M1 when using the 5-way version. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20211213140252.2856053-1-ardb@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 6 ++ arch/arm64/Makefile | 5 ++ arch/arm64/lib/xor-neon.c | 147 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 157 insertions(+), 1 deletion(-) (limited to 'arch/arm64/lib') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c4207cf9bb17..63d41ba4e716 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1545,6 +1545,12 @@ endmenu menu "ARMv8.2 architectural features" +config AS_HAS_ARMV8_2 + def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a) + +config AS_HAS_SHA3 + def_bool $(as-instr,.arch armv8.2-a+sha3) + config ARM64_PMEM bool "Enable support for persistent memory" select ARCH_HAS_PMEM_API diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index e8cfc5868aa8..2f1de88651e6 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -58,6 +58,11 @@ stack_protector_prepare: prepare0 include/generated/asm-offsets.h)) endif +ifeq ($(CONFIG_AS_HAS_ARMV8_2), y) +# make sure to pass the newest target architecture to -march. +asm-arch := armv8.2-a +endif + # Ensure that if the compiler supports branch protection we default it # off, this will be overridden if we are using branch protection. branch-prot-flags-y += $(call cc-option,-mbranch-protection=none) diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c index 11bf4f8aca68..d189cf4e70ea 100644 --- a/arch/arm64/lib/xor-neon.c +++ b/arch/arm64/lib/xor-neon.c @@ -167,7 +167,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, } while (--lines > 0); } -struct xor_block_template const xor_block_inner_neon = { +struct xor_block_template xor_block_inner_neon __ro_after_init = { .name = "__inner_neon__", .do_2 = xor_arm64_neon_2, .do_3 = xor_arm64_neon_3, @@ -176,6 +176,151 @@ struct xor_block_template const xor_block_inner_neon = { }; EXPORT_SYMBOL(xor_block_inner_neon); +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) +{ + uint64x2_t res; + + asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" + "eor3 %0.16b, %1.16b, %2.16b, %3.16b" + : "=w"(res) : "w"(p), "w"(q), "w"(r)); + return res; +} + +static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1, + unsigned long *p2, unsigned long *p3) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + } while (--lines > 0); +} + +static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1, + unsigned long *p2, unsigned long *p3, + unsigned long *p4) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 */ + v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + } while (--lines > 0); +} + +static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1, + unsigned long *p2, unsigned long *p3, + unsigned long *p4, unsigned long *p5) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + uint64_t *dp5 = (uint64_t *)p5; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 ^ p5 */ + v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0)); + v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2)); + v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4)); + v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + dp5 += 8; + } while (--lines > 0); +} + +static int __init xor_neon_init(void) +{ + if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) { + xor_block_inner_neon.do_3 = xor_arm64_eor3_3; + xor_block_inner_neon.do_4 = xor_arm64_eor3_4; + xor_block_inner_neon.do_5 = xor_arm64_eor3_5; + } + return 0; +} +module_init(xor_neon_init); + +static void __exit xor_neon_exit(void) +{ +} +module_exit(xor_neon_exit); + MODULE_AUTHOR("Jackie Liu "); MODULE_DESCRIPTION("ARMv8 XOR Extensions"); MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From 742a15b1a23aa43bde2d9d681281ec1925be13fd Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 14 Dec 2021 15:27:14 +0000 Subject: arm64: Use BTI C directly and unconditionally Now we have a macro for BTI C that looks like a regular instruction change all the users of the current BTI_C macro to just emit a BTI C directly and remove the macro. This does mean that we now unconditionally BTI annotate all assembly functions, meaning that they are worse in this respect than code generated by the compiler. The overhead should be minimal for implementations with a reasonable HINT implementation. Signed-off-by: Mark Brown Reviewed-by: Ard Biesheuvel Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20211214152714.2380849-4-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/linkage.h | 22 ++++++---------------- arch/arm64/kernel/entry-ftrace.S | 8 ++------ arch/arm64/lib/kasan_sw_tags.S | 4 +--- 3 files changed, 9 insertions(+), 25 deletions(-) (limited to 'arch/arm64/lib') diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index 1cfa8bb33edd..9065e4749b42 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -4,16 +4,6 @@ #define __ALIGN .align 2 #define __ALIGN_STR ".align 2" -#if defined(CONFIG_ARM64_BTI_KERNEL) && defined(__aarch64__) - -#define BTI_C bti c ; - -#else - -#define BTI_C - -#endif - /* * When using in-kernel BTI we need to ensure that PCS-conformant * assembly functions have suitable annotations. Override @@ -23,27 +13,27 @@ */ #define SYM_FUNC_START(name) \ SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ - BTI_C + bti c ; #define SYM_FUNC_START_NOALIGN(name) \ SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \ - BTI_C + bti c ; #define SYM_FUNC_START_LOCAL(name) \ SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \ - BTI_C + bti c ; #define SYM_FUNC_START_LOCAL_NOALIGN(name) \ SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \ - BTI_C + bti c ; #define SYM_FUNC_START_WEAK(name) \ SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \ - BTI_C + bti c ; #define SYM_FUNC_START_WEAK_NOALIGN(name) \ SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \ - BTI_C + bti c ; /* * Annotate a function as position independent, i.e., safe to be called before diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 8cf970d219f5..e535480a4069 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -77,17 +77,13 @@ .endm SYM_CODE_START(ftrace_regs_caller) -#ifdef BTI_C - BTI_C -#endif + bti c ftrace_regs_entry 1 b ftrace_common SYM_CODE_END(ftrace_regs_caller) SYM_CODE_START(ftrace_caller) -#ifdef BTI_C - BTI_C -#endif + bti c ftrace_regs_entry 0 b ftrace_common SYM_CODE_END(ftrace_caller) diff --git a/arch/arm64/lib/kasan_sw_tags.S b/arch/arm64/lib/kasan_sw_tags.S index 5b04464c045e..20784ce75def 100644 --- a/arch/arm64/lib/kasan_sw_tags.S +++ b/arch/arm64/lib/kasan_sw_tags.S @@ -38,9 +38,7 @@ * incremented by 256 prior to return). */ SYM_CODE_START(__hwasan_tag_mismatch) -#ifdef BTI_C - BTI_C -#endif + bti c add x29, sp, #232 stp x2, x3, [sp, #8 * 2] stp x4, x5, [sp, #8 * 4] -- cgit v1.2.3-70-g09d2