From 2d9ee327adce5f6becea2dd51d282a6183e40b0f Mon Sep 17 00:00:00 2001
From: Simon Guo
Date: Thu, 7 Jun 2018 09:57:51 +0800
Subject: powerpc/64: Align bytes before fall back to .Lshort in powerpc64
 memcmp()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the 64-byte version of memcmp() in powerpc falls back to
.Lshort (compare-per-byte mode) if either the src or dst address is not
8-byte aligned. This can be optimized in two situations:

1) If both addresses have the same offset from an 8-byte boundary:
memcmp() can compare the unaligned bytes up to the 8-byte boundary
first and then compare the remaining 8-byte-aligned content in .Llong
mode.

2) If src/dst addrs do not have the same offset from an 8-byte boundary:
memcmp() can align the src addr to 8 bytes, increment the dst addr
accordingly, then load src with aligned loads and load dst with
unaligned loads.

This patch optimizes memcmp() behaviour in the above two situations.

Tested with both little and big endian. The performance results below
are based on little endian.

Following is the test result for the case where src/dst have the same
offset (a similar result was observed when src/dst have different
offsets):

(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
	29.773018302 seconds time elapsed ( +- 0.09% )
- with patch
	16.485568173 seconds time elapsed ( +- 0.02% )
	-> There is ~80% improvement

(2) 32 bytes
To observe the performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32

 #define ITERATIONS 10000

 int test_memcmp(const void *s1, const void *s2, size_t n);
--------
- without patch
	0.244746482 seconds time elapsed ( +- 0.36%)
- with patch
	0.215069477 seconds time elapsed ( +- 0.51%)
	-> There is ~13% improvement

(3) 0~8 bytes
To observe the < 8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000

 int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- without patch
	1.845642503 seconds time elapsed ( +- 0.12% )
- with patch
	1.849767135 seconds time elapsed ( +- 0.26% )
	-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo
Signed-off-by: Michael Ellerman
---
 arch/powerpc/lib/memcmp_64.S | 140 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 133 insertions(+), 7 deletions(-)
(limited to 'arch/powerpc/lib')

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b7bd55..5776f91da29e 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_xxxx
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_xxxx
+ */
 _GLOBAL(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses are not
+	 * with the same offset of 8 bytes align boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* Fall back to short loop if compare at aligned addrs
+	 * with less than 8 bytes.
+	 */
+	cmpdi	cr6,r5,7
 
 	beq	cr1,.Lzero
-	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.Lno_short
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,11 +91,89 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.Lno_short:
+	dcbt	0,r3
+	dcbt	0,r4
+	bne	.Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+	/* attempt to compare bytes not aligned with 8 bytes so that
+	 * rest comparison can run based on 8 bytes alignment.
+	 */
+	andi.	r6,r3,7
+
+	/* Try to compare the first double word which is not 8 bytes aligned:
+	 * load the first double word at (src & ~7UL) and shift left appropriate
+	 * bits before comparison.
+	 */
+	rlwinm	r6,r3,3,26,28
+	beq	.Lsameoffset_8bytes_aligned
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+	subfic	r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	addi	r4,r4,8
+	beq	.Lzero
+
+.Lsameoffset_8bytes_aligned:
+	/* now we are aligned with 8 bytes.
+	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+	 */
+	cmpdi	cr6,r5,31
+	bgt	cr6,.Llong
+
+.Lcmp_lt32bytes:
+	/* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+	cmpdi	cr5,r5,7
+	srdi	r0,r5,3
+	ble	cr5,.Lcmp_rest_lt8bytes
+
+	/* handle 8 ~ 31 bytes */
+	clrldi	r5,r5,61
+	mtctr	r0
+2:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	bdnz	2b
+
+	cmpwi	r5,0
+	beq	.Lzero
+
+.Lcmp_rest_lt8bytes:
+	/* Here we have only less than 8 bytes to compare with. at least s1
+	 * Address is aligned with 8 bytes.
+	 * The next double words are load and shift right with appropriate
+	 * bits.
+	 */
+	subfic	r6,r5,8
+	slwi	r6,r6,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
 
 .Llong:
+	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -232,4 +323,39 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
+
+.LcmpAB_lightweight:	/* skip NV GPRS restore */
+	li	r3,1
+	bgtlr
+	li	r3,-1
+	blr
+
+.Ldiffoffset_8bytes_make_align_start:
+	/* now try to align s1 with 8 bytes */
+	rlwinm	r6,r3,3,26,28
+	beq	.Ldiffoffset_align_s1_8bytes
+
+	clrrdi	r3,r3,3
+	LD	rA,0,r3
+	LD	rB,0,r4	/* unaligned load */
+	sld	rA,rA,r6
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+
+	subfic	r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	add	r4,r4,r6
+
+	beq	.Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+	/* now s1 is aligned with 8 bytes. */
+	cmpdi	cr5,r5,31
+	ble	cr5,.Lcmp_lt32bytes
+	b	.Llong
+
 EXPORT_SYMBOL(memcmp)
-- cgit v1.2.3-70-g09d2

From d58badfb7cf1792ab4f1d0cd7896d733b85d650f Mon Sep 17 00:00:00 2001
From: Simon Guo
Date: Thu, 7 Jun 2018 09:57:53 +0800
Subject: powerpc/64: enhance memcmp() with VMX instructions for long bytes
 comparison

This patch adds VMX primitives to do memcmp() in case the compare size
is equal to or greater than 4K bytes. The KSM feature can benefit from
this.
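In C terms, the dispatch this patch adds can be modelled roughly as the
sketch below. It is only a model of the control flow, with made-up names
(memcmp_model(), enter_vmx_ops_model(), vmx_cmp_model()); the real
implementation is the assembly in the diff further below.
------
#include <string.h>

#define VMX_THRESH 4096	/* same threshold as the patch */

/* Made-up stand-in for the kernel's enter_vmx_ops(): returns 0 when VMX
 * is unusable (e.g. in interrupt context) and the scalar path must run.
 */
static int enter_vmx_ops_model(void)
{
	return 1;
}

/* Made-up stand-in for the VMX compare: the assembly picks
 * .Lsameoffset_vmx_cmp when src/dst share the same offset to a 16-byte
 * boundary and .Ldiffoffset_vmx_cmp (lvsr + vperm) otherwise, comparing
 * 32 bytes per iteration; plain memcmp() stands in for both here.
 */
static int vmx_cmp_model(const void *s1, const void *s2, size_t n)
{
	return memcmp(s1, s2, n);
}

static int memcmp_model(const void *s1, const void *s2, size_t n)
{
	/* Only compares of at least 4K try the VMX loops. */
	if (n >= VMX_THRESH && enter_vmx_ops_model())
		return vmx_cmp_model(s1, s2, n);

	return memcmp(s1, s2, n);	/* scalar .Lshort/.Llong paths */
}
------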
Test result with the following test program (replace the "^>" with ""):
------
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include <malloc.h>
>#include <stdio.h>
>#include <stdlib.h>
>#include <string.h>
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
	char *s1;
	char *s2;
	unsigned long i;

	s1 = memalign(128, SIZE);
	if (!s1) {
		perror("memalign");
		exit(1);
	}

	s2 = memalign(128, SIZE);
	if (!s2) {
		perror("memalign");
		exit(1);
	}

	for (i = 0; i < SIZE; i++) {
		s1[i] = i & 0xff;
		s2[i] = i & 0xff;
	}

	for (i = 0; i < ITERATIONS; i++) {
		int ret = test_memcmp(s1, s2, SIZE);

		if (ret) {
			printf("return %d at[%ld]! should have returned zero\n",
			       ret, i);
			abort();
		}
	}

	return 0;
}

int main(void)
{
	return test_harness(testcase, "memcmp");
}
------

Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
	4.726728762 seconds time elapsed ( +- 3.54%)

With VMX patch:
	4.234335473 seconds time elapsed ( +- 2.63%)

There is ~10% improvement.

Testing with the unaligned and different-offset version (s1 and s2
shifted by a random offset within 16 bytes) achieves an improvement
higher than 10%.

Signed-off-by: Simon Guo
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S        |   4 +-
 arch/powerpc/lib/memcmp_64.S              | 241 +++++++++++++++++++++++++++++-
 arch/powerpc/lib/memcpy_power7.S          |   6 +-
 arch/powerpc/lib/vmx-helper.c             |   4 +-
 5 files changed, 248 insertions(+), 11 deletions(-)
(limited to 'arch/powerpc/lib')

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 7841b8a60657..769567b66c0c 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -48,8 +48,8 @@ void __trace_opal_exit(long opcode, unsigned long retval);
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7ab20e..e38f956f7d9f 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
 	addi	r3,r3,128
 	bdnz	1b
 
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #else
 	li	r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 5776f91da29e..be2f7925926b 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
  */
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/ppc-opcode.h>
 
 #define off8	r6
 #define off16	r7
@@ -27,12 +28,73 @@
 #define LH	lhbrx
 #define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH	lhzx
 #define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS	\
+	mflr	r0;	\
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std 
r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ + std r0,16(r1); \ + stdu r1,-STACKFRAMESIZE(r1); \ + bl enter_vmx_ops; \ + cmpwi cr1,r3,0; \ + ld r0,STACKFRAMESIZE+16(r1); \ + ld r3,STK_REG(R31)(r1); \ + ld r4,STK_REG(R30)(r1); \ + ld r5,STK_REG(R29)(r1); \ + addi r1,r1,STACKFRAMESIZE; \ + mtlr r0 + +#define EXIT_VMX_OPS \ + mflr r0; \ + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ + std r0,16(r1); \ + stdu r1,-STACKFRAMESIZE(r1); \ + bl exit_vmx_ops; \ + ld r0,STACKFRAMESIZE+16(r1); \ + ld r3,STK_REG(R31)(r1); \ + ld r4,STK_REG(R30)(r1); \ + ld r5,STK_REG(R29)(r1); \ + addi r1,r1,STACKFRAMESIZE; \ + mtlr r0 + +/* + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with + * 16 bytes boundary and permute the result with the 1st 16 bytes. + + * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z | + * ^ ^ ^ + * 0xbbbb10 0xbbbb20 0xbbb30 + * ^ + * _vaddr + * + * + * _vmask is the mask generated by LVS + * _v1st_qw is the 1st aligned QW of current addr which is already loaded. + * for example: 0xyyyyyyyyyyyyy012 for big endian + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded. + * for example: 0x3456789abcdefzzz for big endian + * The permute result is saved in _v_res. + * for example: 0x0123456789abcdef for big endian. + */ +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \ + lvx _v2nd_qw,_vaddr,off16; \ + VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask) + /* * There are 2 categories for memcmp: * 1) src/dst has the same offset to the 8 bytes boundary. The handlers @@ -40,7 +102,7 @@ * 2) src/dst has different offset to the 8 bytes boundary. The handlers * are named like .Ldiffoffset_xxxx */ -_GLOBAL(memcmp) +_GLOBAL_TOC(memcmp) cmpdi cr1,r5,0 /* Use the short loop if the src/dst addresses are not @@ -132,7 +194,7 @@ _GLOBAL(memcmp) bgt cr6,.Llong .Lcmp_lt32bytes: - /* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */ + /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */ cmpdi cr5,r5,7 srdi r0,r5,3 ble cr5,.Lcmp_rest_lt8bytes @@ -173,6 +235,15 @@ _GLOBAL(memcmp) blr .Llong: +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + /* Try to use vmx loop if length is equal or greater than 4K */ + cmpldi cr6,r5,VMX_THRESH + bge cr6,.Lsameoffset_vmx_cmp +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +.Llong_novmx_cmp: +#endif /* At least s1 addr is aligned with 8 bytes */ li off8,8 li off16,16 @@ -330,7 +401,97 @@ _GLOBAL(memcmp) li r3,-1 blr +#ifdef CONFIG_ALTIVEC +.Lsameoffset_vmx_cmp: + /* Enter with src/dst addrs has the same offset with 8 bytes + * align boundary + */ + ENTER_VMX_OPS + beq cr1,.Llong_novmx_cmp + +3: + /* need to check whether r4 has the same offset with r3 + * for 16 bytes boundary. + */ + xor r0,r3,r4 + andi. r0,r0,0xf + bne .Ldiffoffset_vmx_cmp_start + + /* len is no less than 4KB. Need to align with 16 bytes further. + */ + andi. 
rA,r3,8 + LD rA,0,r3 + beq 4f + LD rB,0,r4 + cmpld cr0,rA,rB + addi r3,r3,8 + addi r4,r4,8 + addi r5,r5,-8 + + beq cr0,4f + /* save and restore cr0 */ + mfocrf r5,128 + EXIT_VMX_OPS + mtocrf 128,r5 + b .LcmpAB_lightweight + +4: + /* compare 32 bytes for each loop */ + srdi r0,r5,5 + mtctr r0 + clrldi r5,r5,59 + li off16,16 + +.balign 16 +5: + lvx v0,0,r3 + lvx v1,0,r4 + VCMPEQUD_RC(v0,v0,v1) + bnl cr6,7f + lvx v0,off16,r3 + lvx v1,off16,r4 + VCMPEQUD_RC(v0,v0,v1) + bnl cr6,6f + addi r3,r3,32 + addi r4,r4,32 + bdnz 5b + + EXIT_VMX_OPS + cmpdi r5,0 + beq .Lzero + b .Lcmp_lt32bytes + +6: + addi r3,r3,16 + addi r4,r4,16 + +7: + /* diff the last 16 bytes */ + EXIT_VMX_OPS + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + li off8,8 + bne cr0,.LcmpAB_lightweight + + LD rA,off8,r3 + LD rB,off8,r4 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + b .Lzero +#endif + .Ldiffoffset_8bytes_make_align_start: +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + /* only do vmx ops when the size equal or greater than 4K bytes */ + cmpdi cr5,r5,VMX_THRESH + bge cr5,.Ldiffoffset_vmx_cmp +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +.Ldiffoffset_novmx_cmp: +#endif + /* now try to align s1 with 8 bytes */ rlwinm r6,r3,3,26,28 beq .Ldiffoffset_align_s1_8bytes @@ -356,6 +517,82 @@ _GLOBAL(memcmp) /* now s1 is aligned with 8 bytes. */ cmpdi cr5,r5,31 ble cr5,.Lcmp_lt32bytes + +#ifdef CONFIG_ALTIVEC + b .Llong_novmx_cmp +#else b .Llong +#endif + +#ifdef CONFIG_ALTIVEC +.Ldiffoffset_vmx_cmp: + ENTER_VMX_OPS + beq cr1,.Ldiffoffset_novmx_cmp + +.Ldiffoffset_vmx_cmp_start: + /* Firstly try to align r3 with 16 bytes */ + andi. r6,r3,0xf + li off16,16 + beq .Ldiffoffset_vmx_s1_16bytes_align + LVS v3,0,r3 + LVS v4,0,r4 + + lvx v5,0,r3 + lvx v6,0,r4 + LD_VSR_CROSS16B(r3,v3,v5,v7,v9) + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) + + VCMPEQUB_RC(v7,v9,v10) + bnl cr6,.Ldiffoffset_vmx_diff_found + + subfic r6,r6,16 + subf r5,r6,r5 + add r3,r3,r6 + add r4,r4,r6 + +.Ldiffoffset_vmx_s1_16bytes_align: + /* now s1 is aligned with 16 bytes */ + lvx v6,0,r4 + LVS v4,0,r4 + srdi r6,r5,5 /* loop for 32 bytes each */ + clrldi r5,r5,59 + mtctr r6 + +.balign 16 +.Ldiffoffset_vmx_32bytesloop: + /* the first qw of r4 was saved in v6 */ + lvx v9,0,r3 + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) + VCMPEQUB_RC(v7,v9,v10) + vor v6,v8,v8 + bnl cr6,.Ldiffoffset_vmx_diff_found + + addi r3,r3,16 + addi r4,r4,16 + + lvx v9,0,r3 + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) + VCMPEQUB_RC(v7,v9,v10) + vor v6,v8,v8 + bnl cr6,.Ldiffoffset_vmx_diff_found + + addi r3,r3,16 + addi r4,r4,16 + + bdnz .Ldiffoffset_vmx_32bytesloop + + EXIT_VMX_OPS + + cmpdi r5,0 + beq .Lzero + b .Lcmp_lt32bytes + +.Ldiffoffset_vmx_diff_found: + EXIT_VMX_OPS + /* anyway, the diff will appear in next 16 bytes */ + li r5,16 + b .Lcmp_lt32bytes + +#endif EXPORT_SYMBOL(memcmp) diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S index df7de9d3da08..070cdf6f584f 100644 --- a/arch/powerpc/lib/memcpy_power7.S +++ b/arch/powerpc/lib/memcpy_power7.S @@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7) std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) std r0,16(r1) stdu r1,-STACKFRAMESIZE(r1) - bl enter_vmx_copy + bl enter_vmx_ops cmpwi cr1,r3,0 ld r0,STACKFRAMESIZE+16(r1) ld r3,STK_REG(R31)(r1) @@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7) 15: addi r1,r1,STACKFRAMESIZE ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) - b exit_vmx_copy /* tail call optimise */ + b exit_vmx_ops /* tail call optimise */ .Lvmx_unaligned_copy: /* Get the destination 16B aligned */ @@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7) 15: addi r1,r1,STACKFRAMESIZE ld 
r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cdcaca9..9f340494a8ac 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
  * passed a pointer to the destination which we return as required by a
  * memcpy implementation.
  */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
 {
 	disable_kernel_altivec();
 	preempt_enable();
-- cgit v1.2.3-70-g09d2

From c2a4e54e8b6a89dde574608c47e460a0371e44be Mon Sep 17 00:00:00 2001
From: Simon Guo
Date: Thu, 7 Jun 2018 09:57:54 +0800
Subject: powerpc/64: add 32 bytes prechecking before using VMX optimization on
 memcmp()

This patch is based on the previous VMX patch for memcmp().

To optimize ppc64 memcmp() with VMX instructions, we need to think
about the VMX penalty they bring: if the kernel uses VMX instructions,
it needs to save/restore the current thread's VMX registers. There are
32 x 128-bit VMX registers in PPC, which means 32 x 16 = 512 bytes to
load and store.

The major concern regarding memcmp() performance in the kernel is KSM,
which uses memcmp() frequently to merge identical pages. So it makes
sense to take some measures/enhancements for KSM to see whether any
improvement can be done here.

Cyril Bur indicated in the mail below that memcmp() for KSM has a high
probability of failing (mismatching) early, in the first bytes:
https://patchwork.ozlabs.org/patch/817322/#1773629
This patch is a follow-up on that observation.

Per some testing, KSM memcmp() fails early within the first 32 bytes.
More specifically:
- 76% of cases fail/mismatch before 16 bytes;
- 83% of cases fail/mismatch before 32 bytes;
- 84% of cases fail/mismatch before 64 bytes;

So 32 bytes looks like a better pre-checking size than the
alternatives. The early failure also holds for memcmp() in the non-KSM
case: with a non-typical call load, ~73% of cases fail before the
first 32 bytes.

This patch adds a 32-byte pre-check before jumping into VMX operations,
to avoid the unnecessary VMX penalty. It is not limited to the KSM
case. Testing shows a ~20% improvement in memcmp() average execution
time with this patch.

Note that the 32B pre-check is only performed when the compare size is
long enough (>= 4K currently) to allow VMX operation.

The detailed data and analysis are at:
https://github.com/justdoitqd/publicFiles/blob/master/memcmp/README.md

Signed-off-by: Simon Guo
Signed-off-by: Michael Ellerman
---
 arch/powerpc/lib/memcmp_64.S | 57 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 46 insertions(+), 11 deletions(-)
(limited to 'arch/powerpc/lib')

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index be2f7925926b..844d8e774492 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -404,8 +404,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #ifdef CONFIG_ALTIVEC
 .Lsameoffset_vmx_cmp:
 	/* Enter with src/dst addrs has the same offset with 8 bytes
-	 * align boundary
+	 * align boundary.
+	 *
+	 * There is an optimization based on the following fact: memcmp()
+	 * tends to fail early at the first 32 bytes.
+	 * Before applying VMX instructions which will lead to 32x128bits
+	 * VMX regs load/restore penalty, we compare the first 32 bytes
+	 * so that we can catch the ~80% fail cases.
 	 */
+
+	li	r0,4
+	mtctr	r0
+.Lsameoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Lsameoffset_prechk_32B_loop
+
 	ENTER_VMX_OPS
 	beq	cr1,.Llong_novmx_cmp
 
@@ -482,16 +501,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #endif
 
 .Ldiffoffset_8bytes_make_align_start:
-#ifdef CONFIG_ALTIVEC
-BEGIN_FTR_SECTION
-	/* only do vmx ops when the size equal or greater than 4K bytes */
-	cmpdi	cr5,r5,VMX_THRESH
-	bge	cr5,.Ldiffoffset_vmx_cmp
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-
-.Ldiffoffset_novmx_cmp:
-#endif
-
 	/* now try to align s1 with 8 bytes */
 	rlwinm	r6,r3,3,26,28
 	beq	.Ldiffoffset_align_s1_8bytes
@@ -515,6 +524,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 .Ldiffoffset_align_s1_8bytes:
 	/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* only do vmx ops when the size equal or greater than 4K bytes */
+	cmpdi	cr5,r5,VMX_THRESH
+	bge	cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
+
 	cmpdi	cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
 
@@ -526,6 +546,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_ALTIVEC
 .Ldiffoffset_vmx_cmp:
+	/* perform a 32-byte pre-check before
+	 * enabling VMX operations.
+	 */
+	li	r0,4
+	mtctr	r0
+.Ldiffoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Ldiffoffset_prechk_32B_loop
+
 	ENTER_VMX_OPS
 	beq	cr1,.Ldiffoffset_novmx_cmp
-- cgit v1.2.3-70-g09d2

From ec0c464cdbf38bf6ddabec8bfa595bd421cab203 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 5 Jul 2018 16:24:57 +0000
Subject: powerpc: move ASM_CONST and stringify_in_c() into asm-const.h

This patch moves ASM_CONST() and stringify_in_c() into a dedicated
asm-const.h, then cleans up all related inclusions.
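As a quick illustration of what the two helpers do (a sketch by way of
example only, not part of the patch; MY_OFFSET and my_sync() are
made-up names):
------
/* A header shared by C and assembly now only needs asm-const.h.
 * ASM_CONST() keeps a constant usable in both languages: it expands to
 * the bare value in assembly and gains a UL suffix in C.
 */
#include <asm/asm-const.h>

#define MY_OFFSET	ASM_CONST(0x8000)	/* made-up constant */

#ifndef __ASSEMBLY__
/* stringify_in_c() turns tokens (commas included) into a quoted string,
 * handy for building inline-asm templates from shared definitions.
 */
static inline void my_sync(void)
{
	asm volatile(stringify_in_c(sync) : : : "memory");
}
#endif
------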
Signed-off-by: Christophe Leroy [mpe: asm-compat.h should include asm-const.h] Signed-off-by: Michael Ellerman --- arch/powerpc/crypto/md5-asm.S | 1 + arch/powerpc/crypto/sha1-powerpc-asm.S | 1 + arch/powerpc/include/asm/asm-compat.h | 13 +------------ arch/powerpc/include/asm/asm-const.h | 14 ++++++++++++++ arch/powerpc/include/asm/barrier.h | 2 ++ arch/powerpc/include/asm/book3s/64/hash.h | 2 ++ arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- arch/powerpc/include/asm/book3s/64/radix.h | 2 ++ arch/powerpc/include/asm/cmpxchg.h | 1 - arch/powerpc/include/asm/code-patching.h | 1 + arch/powerpc/include/asm/cputable.h | 2 +- arch/powerpc/include/asm/dt_cpu_ftrs.h | 1 - arch/powerpc/include/asm/feature-fixups.h | 2 ++ arch/powerpc/include/asm/firmware.h | 2 +- arch/powerpc/include/asm/futex.h | 1 - arch/powerpc/include/asm/iommu.h | 1 + arch/powerpc/include/asm/jump_label.h | 2 +- arch/powerpc/include/asm/mmu-44x.h | 1 + arch/powerpc/include/asm/mmu.h | 2 +- arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/powerpc/include/asm/page.h | 2 +- arch/powerpc/include/asm/page_64.h | 2 ++ arch/powerpc/include/asm/ppc-opcode.h | 2 +- arch/powerpc/include/asm/ptrace.h | 1 + arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/include/asm/reg_a2.h | 2 ++ arch/powerpc/include/asm/spinlock.h | 1 - arch/powerpc/include/asm/synch.h | 1 + arch/powerpc/include/asm/thread_info.h | 2 ++ arch/powerpc/include/asm/uaccess.h | 1 - arch/powerpc/kernel/entry_64.S | 1 + arch/powerpc/kernel/fpu.S | 1 + arch/powerpc/kernel/idle_book3s.S | 1 + arch/powerpc/kernel/kvm_emul.S | 1 + arch/powerpc/kernel/ppc_save_regs.S | 1 + arch/powerpc/kernel/vector.S | 1 + arch/powerpc/kvm/book3s_64_slb.S | 2 ++ arch/powerpc/kvm/book3s_hv_interrupts.S | 1 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1 + arch/powerpc/kvm/book3s_interrupts.S | 1 + arch/powerpc/kvm/book3s_rmhandlers.S | 1 + arch/powerpc/kvm/book3s_segment.S | 2 ++ arch/powerpc/lib/copyuser_64.S | 1 + arch/powerpc/lib/feature-fixups-test.S | 1 + arch/powerpc/lib/ldstfp.S | 1 + arch/powerpc/lib/memcpy_64.S | 1 + arch/powerpc/mm/tlb_nohash_low.S | 1 + arch/powerpc/net/bpf_jit32.h | 1 + arch/powerpc/net/bpf_jit_asm.S | 1 + arch/powerpc/net/bpf_jit_comp.c | 1 + arch/powerpc/net/bpf_jit_comp64.c | 1 + arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + arch/powerpc/platforms/pseries/setup.c | 1 + arch/powerpc/purgatory/trampoline.S | 10 +--------- arch/powerpc/xmon/spr_access.S | 1 + tools/testing/selftests/powerpc/copyloops/asm/asm-compat.h | 0 tools/testing/selftests/powerpc/primitives/asm/asm-const.h | 1 + 57 files changed, 73 insertions(+), 33 deletions(-) create mode 100644 arch/powerpc/include/asm/asm-const.h create mode 100644 tools/testing/selftests/powerpc/copyloops/asm/asm-compat.h create mode 120000 tools/testing/selftests/powerpc/primitives/asm/asm-const.h (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/crypto/md5-asm.S b/arch/powerpc/crypto/md5-asm.S index 10cdf5bceebb..1834065362c7 100644 --- a/arch/powerpc/crypto/md5-asm.S +++ b/arch/powerpc/crypto/md5-asm.S @@ -11,6 +11,7 @@ */ #include #include +#include #define rHP r3 #define rWP r4 diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S b/arch/powerpc/crypto/sha1-powerpc-asm.S index c8951ce0dcc4..23e248beff71 100644 --- a/arch/powerpc/crypto/sha1-powerpc-asm.S +++ b/arch/powerpc/crypto/sha1-powerpc-asm.S @@ -7,6 +7,7 @@ #include #include +#include #ifdef __BIG_ENDIAN__ #define LWZ(rt, d, ra) \ diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index 
d2cf3593e987..19b70c5b5f18 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -1,21 +1,10 @@ #ifndef _ASM_POWERPC_ASM_COMPAT_H #define _ASM_POWERPC_ASM_COMPAT_H +#include #include #include -#ifdef __ASSEMBLY__ -# define stringify_in_c(...) __VA_ARGS__ -# define ASM_CONST(x) x -#else -/* This version of stringify will deal with commas... */ -# define __stringify_in_c(...) #__VA_ARGS__ -# define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " -# define __ASM_CONST(x) x##UL -# define ASM_CONST(x) __ASM_CONST(x) -#endif - - #ifdef __powerpc64__ /* operations for longs and pointers */ diff --git a/arch/powerpc/include/asm/asm-const.h b/arch/powerpc/include/asm/asm-const.h new file mode 100644 index 000000000000..082c1538c562 --- /dev/null +++ b/arch/powerpc/include/asm/asm-const.h @@ -0,0 +1,14 @@ +#ifndef _ASM_POWERPC_ASM_CONST_H +#define _ASM_POWERPC_ASM_CONST_H + +#ifdef __ASSEMBLY__ +# define stringify_in_c(...) __VA_ARGS__ +# define ASM_CONST(x) x +#else +/* This version of stringify will deal with commas... */ +# define __stringify_in_c(...) #__VA_ARGS__ +# define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " +# define __ASM_CONST(x) x##UL +# define ASM_CONST(x) __ASM_CONST(x) +#endif +#endif /* _ASM_POWERPC_ASM_CONST_H */ diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index f67b3f6e36be..de1316874e45 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -5,6 +5,8 @@ #ifndef _ASM_POWERPC_BARRIER_H #define _ASM_POWERPC_BARRIER_H +#include + /* * Memory barrier. * The sync instruction guarantees that all memory accesses initiated diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 0387b155f13d..d52a51b2ce7b 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -3,6 +3,8 @@ #define _ASM_POWERPC_BOOK3S_64_HASH_H #ifdef __KERNEL__ +#include + /* * Common bits between 4K and 64K pages in a linux-style PTE. * Additional bits may be defined in pgtable-hash64-*.h diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index eee0b5b8a23f..2f74bdc805e0 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -12,9 +12,9 @@ * 2 of the License, or (at your option) any later version. 
*/ -#include #include #include +#include /* * This is necessary to get the definition of PGTABLE_RANGE which we diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 3ab3f7aef022..77440e837869 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -2,6 +2,8 @@ #ifndef _ASM_POWERPC_PGTABLE_RADIX_H #define _ASM_POWERPC_PGTABLE_RADIX_H +#include + #ifndef __ASSEMBLY__ #include #endif diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index 67ec1073ac97..27183871eb3b 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -5,7 +5,6 @@ #ifdef __KERNEL__ #include #include -#include #include #include diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 812535f40124..9ecc7bfc8ae7 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -14,6 +14,7 @@ #include #include #include +#include /* Flags for create_branch: * "b" == create_branch(addr, target, 0); diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 82816a2043b9..751126c22ed9 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -4,9 +4,9 @@ #include -#include #include #include +#include #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/dt_cpu_ftrs.h b/arch/powerpc/include/asm/dt_cpu_ftrs.h index 71515d909ed1..55113432fc91 100644 --- a/arch/powerpc/include/asm/dt_cpu_ftrs.h +++ b/arch/powerpc/include/asm/dt_cpu_ftrs.h @@ -10,7 +10,6 @@ */ #include -#include #include #include diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index fcfd05672b1b..33b6f9c892c8 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -1,6 +1,8 @@ #ifndef __ASM_POWERPC_FEATURE_FIXUPS_H #define __ASM_POWERPC_FEATURE_FIXUPS_H +#include + /* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 535add3f7791..ce8aab72c21b 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -14,8 +14,8 @@ #ifdef __KERNEL__ -#include #include +#include /* firmware feature bitmask values */ diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 76c8648d0fa8..94542776a62d 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index daa3ee5d7ad2..ab3a4fba38e3 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -30,6 +30,7 @@ #include #include #include +#include #define IOMMU_PAGE_SHIFT_4K 12 #define IOMMU_PAGE_SIZE_4K (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K) diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 9a287e0ac8b1..a3b2cf940b4e 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -14,7 +14,7 @@ #include #include -#include +#include #define JUMP_ENTRY_TYPE stringify_in_c(FTR_ENTRY_LONG) #define JUMP_LABEL_NOP_SIZE 4 diff --git 
a/arch/powerpc/include/asm/mmu-44x.h b/arch/powerpc/include/asm/mmu-44x.h index cb57f29f531d..9bdbe1d1c9b9 100644 --- a/arch/powerpc/include/asm/mmu-44x.h +++ b/arch/powerpc/include/asm/mmu-44x.h @@ -6,6 +6,7 @@ */ #include +#include #define PPC44x_MMUCR_TID 0x000000ff #define PPC44x_MMUCR_STS 0x00010000 diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 61d15ce92278..8418d83b5eb0 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -5,8 +5,8 @@ #include -#include #include +#include /* * MMU features bit definitions diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index dd0c7236208f..fe05b3e03cf1 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -8,6 +8,7 @@ #include #include +#include #ifdef CONFIG_PPC_64K_PAGES #error "Page size not supported" diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index a9fbefaacf10..f6a1265face2 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -16,7 +16,7 @@ #else #include #endif -#include +#include /* * On regular PPC32 page size is 4K (but we support 4K/16K/64K/256K pages diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index af04acdb873f..c0ce17e909ef 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -10,6 +10,8 @@ * 2 of the License, or (at your option) any later version. */ +#include + /* * We always define HW_PAGE_SHIFT to 12 as use of 64K pages remains Linux * specific, every notion of page number shared with the firmware, TCEs, diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index c103caf99897..954edf935158 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -13,7 +13,7 @@ #define _ASM_POWERPC_PPC_OPCODE_H #include -#include +#include #define __REG_R0 0 #define __REG_R1 1 diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index e4923686e43a..447cbd1bee99 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -24,6 +24,7 @@ #define _ASM_POWERPC_PTRACE_H #include +#include #ifdef __powerpc64__ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 562568414cf4..d4a8dc71a057 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -13,6 +13,7 @@ #include #include +#include /* Pickup Book E specific registers. 
*/ #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) diff --git a/arch/powerpc/include/asm/reg_a2.h b/arch/powerpc/include/asm/reg_a2.h index 3ba9c6f096fc..74c2c57c492a 100644 --- a/arch/powerpc/include/asm/reg_a2.h +++ b/arch/powerpc/include/asm/reg_a2.h @@ -12,6 +12,8 @@ #ifndef __ASM_POWERPC_REG_A2_H__ #define __ASM_POWERPC_REG_A2_H__ +#include + #define SPRN_TENSR 0x1b5 #define SPRN_TENS 0x1b6 /* Thread ENable Set */ #define SPRN_TENC 0x1b7 /* Thread ENable Clear */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 7ec38f4ee927..685c72310f5d 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -24,7 +24,6 @@ #include #include #endif -#include #include #include #include diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h index 6ec546090ba1..f6f8c75bbb24 100644 --- a/arch/powerpc/include/asm/synch.h +++ b/arch/powerpc/include/asm/synch.h @@ -5,6 +5,7 @@ #include #include +#include #ifndef __ASSEMBLY__ extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index f308dfeb2746..ae554b6fe6b9 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -9,6 +9,8 @@ #ifndef _ASM_POWERPC_THREAD_INFO_H #define _ASM_POWERPC_THREAD_INFO_H +#include + #ifdef __KERNEL__ #define THREAD_SHIFT CONFIG_THREAD_SHIFT diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 468653ce844c..643cfbd5bcb5 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -2,7 +2,6 @@ #ifndef _ARCH_POWERPC_UACCESS_H #define _ARCH_POWERPC_UACCESS_H -#include #include #include #include diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 0357f87a013c..f62d9ddc0312 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_BOOK3S #include #else diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 6c509f39bbde..07c913fd5aba 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef CONFIG_VSX #define __REST_32FPVSRS(n,c,base) \ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index d85d5515a091..436caa9d6eec 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -23,6 +23,7 @@ #include #include #include +#include #undef DEBUG diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index e100ff324a85..c005088f6c9c 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -23,6 +23,7 @@ #include #include #include +#include #define KVM_MAGIC_PAGE (-4096) diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S index 8afbe213d729..6d1b42ee797c 100644 --- a/arch/powerpc/kernel/ppc_save_regs.S +++ b/arch/powerpc/kernel/ppc_save_regs.S @@ -12,6 +12,7 @@ #include #include #include +#include /* * Grab the register values as they are now. diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index f314fd475491..21165da0052d 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -8,6 +8,7 @@ #include #include #include +#include /* * Load state from memory into VMX registers including VSCR. 
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 688722acd692..d293485c1a60 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -17,6 +17,8 @@ * Authors: Alexander Graf */ +#include + #define SHADOW_SLB_ENTRY_LEN 0x10 #define OFFSET_ESID(x) (SHADOW_SLB_ENTRY_LEN * x) #define OFFSET_VSID(x) ((SHADOW_SLB_ENTRY_LEN * x) + 8) diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 82f2ff9410b6..4218073eea1f 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -27,6 +27,7 @@ #include #include #include +#include /***************************************************************************** * * diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 6e4554b273f1..7405222a4e28 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -32,6 +32,7 @@ #include #include #include +#include /* Sign-extend HDEC if not on POWER9 */ #define EXTEND_HDEC(reg) \ diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index c18e845019ec..d71dab16dc6f 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -23,6 +23,7 @@ #include #include #include +#include #if defined(CONFIG_PPC_BOOK3S_64) #ifdef PPC64_ELF_ABI_v2 diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 34a5adeff084..b0089e04c8c8 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_BOOK3S_64 #include diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 98ccc7ec5d48..7fec258bb072 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -19,6 +19,8 @@ /* Real mode helpers */ +#include + #if defined(CONFIG_PPC_BOOK3S_64) #define GET_SHADOW_VCPU(reg) \ diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S index 506677395681..65244263b6a3 100644 --- a/arch/powerpc/lib/copyuser_64.S +++ b/arch/powerpc/lib/copyuser_64.S @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef __BIG_ENDIAN__ #define sLd sld /* Shift towards low-numbered address. 
*/ diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S index f16cec989506..ee7c5fd5fc64 100644 --- a/arch/powerpc/lib/feature-fixups-test.S +++ b/arch/powerpc/lib/feature-fixups-test.S @@ -11,6 +11,7 @@ #include #include #include +#include .text diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S index ae15eba49c1f..32e91994b6b2 100644 --- a/arch/powerpc/lib/ldstfp.S +++ b/arch/powerpc/lib/ldstfp.S @@ -15,6 +15,7 @@ #include #include #include +#include #include #ifdef CONFIG_PPC_FPU diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S index 8d8265be1a59..26ea02b7311f 100644 --- a/arch/powerpc/lib/memcpy_64.S +++ b/arch/powerpc/lib/memcpy_64.S @@ -9,6 +9,7 @@ #include #include #include +#include .align 7 _GLOBAL_TOC(memcpy) diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 048b8e9f4492..505a3d010c47 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -34,6 +34,7 @@ #include #include #include +#include #if defined(CONFIG_40x) diff --git a/arch/powerpc/net/bpf_jit32.h b/arch/powerpc/net/bpf_jit32.h index a8cd7e289ecd..6f4daacad296 100644 --- a/arch/powerpc/net/bpf_jit32.h +++ b/arch/powerpc/net/bpf_jit32.h @@ -13,6 +13,7 @@ #ifndef _BPF_JIT32_H #define _BPF_JIT32_H +#include #include "bpf_jit.h" #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/net/bpf_jit_asm.S b/arch/powerpc/net/bpf_jit_asm.S index 3dd9c43d40c9..c80280dc2e04 100644 --- a/arch/powerpc/net/bpf_jit_asm.S +++ b/arch/powerpc/net/bpf_jit_asm.S @@ -10,6 +10,7 @@ */ #include +#include #include "bpf_jit32.h" /* diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 5b061fc81df3..d5bfe24bb3b5 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 380cbf9a40d9..b8de5244a58c 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -13,6 +13,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index a8d9b4089c31..4016e3c3d18b 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -14,6 +14,7 @@ #include #include #include +#include .section ".text" diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 139f0af6c3d9..e14ccf32a97d 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -69,6 +69,7 @@ #include #include #include +#include #include "pseries.h" diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S index 4aad9dd10ace..1e1129553fd7 100644 --- a/arch/powerpc/purgatory/trampoline.S +++ b/arch/powerpc/purgatory/trampoline.S @@ -12,15 +12,7 @@ * Software Foundation (version 2 of the License). */ -#if defined(__LITTLE_ENDIAN__) -#define STWX_BE stwbrx -#define LWZX_BE lwbrx -#elif defined(__BIG_ENDIAN__) -#define STWX_BE stwx -#define LWZX_BE lwzx -#else -#error no endianness defined! 
-#endif +#include .machine ppc64 .balign 256 diff --git a/arch/powerpc/xmon/spr_access.S b/arch/powerpc/xmon/spr_access.S index 4099cbcddaaa..720a52afdd58 100644 --- a/arch/powerpc/xmon/spr_access.S +++ b/arch/powerpc/xmon/spr_access.S @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include /* unsigned long xmon_mfspr(sprn, default_value) */ _GLOBAL(xmon_mfspr) diff --git a/tools/testing/selftests/powerpc/copyloops/asm/asm-compat.h b/tools/testing/selftests/powerpc/copyloops/asm/asm-compat.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/testing/selftests/powerpc/primitives/asm/asm-const.h b/tools/testing/selftests/powerpc/primitives/asm/asm-const.h new file mode 120000 index 000000000000..18d8be13e67f --- /dev/null +++ b/tools/testing/selftests/powerpc/primitives/asm/asm-const.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/asm/asm-const.h \ No newline at end of file -- cgit v1.2.3-70-g09d2 From 5c35a02c545a7bbe77f3a1ae337d9e29beed079b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 5 Jul 2018 16:24:59 +0000 Subject: powerpc: clean the inclusion of stringify.h Only include linux/stringify.h is files using __stringify() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dcr-native.h | 1 + arch/powerpc/include/asm/ppc-opcode.h | 1 - arch/powerpc/include/asm/reg_fsl_emb.h | 2 ++ arch/powerpc/include/asm/synch.h | 1 - arch/powerpc/include/asm/thread_info.h | 1 - arch/powerpc/kernel/prom.c | 1 - arch/powerpc/kernel/prom_init.c | 1 - arch/powerpc/kvm/book3s_64_vio_hv.c | 1 + arch/powerpc/lib/locks.c | 1 - arch/powerpc/perf/req-gen/_begin.h | 2 ++ arch/powerpc/perf/req-gen/perf.h | 1 + arch/powerpc/platforms/cell/cbe_thermal.c | 1 + arch/powerpc/platforms/cell/spufs/sputrace.h | 1 + arch/powerpc/platforms/powernv/vas.h | 1 + arch/powerpc/platforms/pseries/mobility.c | 1 + 15 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/include/asm/dcr-native.h b/arch/powerpc/include/asm/dcr-native.h index 4a2beef74277..151dff555f50 100644 --- a/arch/powerpc/include/asm/dcr-native.h +++ b/arch/powerpc/include/asm/dcr-native.h @@ -25,6 +25,7 @@ #include #include #include +#include typedef struct { unsigned int base; diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 954edf935158..665af14850e4 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -12,7 +12,6 @@ #ifndef _ASM_POWERPC_PPC_OPCODE_H #define _ASM_POWERPC_PPC_OPCODE_H -#include #include #define __REG_R0 0 diff --git a/arch/powerpc/include/asm/reg_fsl_emb.h b/arch/powerpc/include/asm/reg_fsl_emb.h index d7ccf93e6279..a21f529c43d9 100644 --- a/arch/powerpc/include/asm/reg_fsl_emb.h +++ b/arch/powerpc/include/asm/reg_fsl_emb.h @@ -7,6 +7,8 @@ #ifndef __ASM_POWERPC_REG_FSL_EMB_H__ #define __ASM_POWERPC_REG_FSL_EMB_H__ +#include + #ifndef __ASSEMBLY__ /* Performance Monitor Registers */ #define mfpmr(rn) ({unsigned int rval; \ diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h index f6f8c75bbb24..aca70fb43147 100644 --- a/arch/powerpc/include/asm/synch.h +++ b/arch/powerpc/include/asm/synch.h @@ -3,7 +3,6 @@ #define _ASM_POWERPC_SYNCH_H #ifdef __KERNEL__ -#include #include #include diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index ae554b6fe6b9..3c0002044bc9 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ 
b/arch/powerpc/include/asm/thread_info.h @@ -27,7 +27,6 @@ #include #include #include -#include #include /* diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 8f32f14ba508..c4d7078e5295 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index c45fb463c9e5..0433bf24a10d 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index d4bcd1b17b09..12318fa62655 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index b7b1237d4aa6..35a0ef932e1a 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -15,7 +15,6 @@ #include #include #include -#include #include /* waiting for a spinlock... */ diff --git a/arch/powerpc/perf/req-gen/_begin.h b/arch/powerpc/perf/req-gen/_begin.h index 549f8782c52d..a200b86eba3b 100644 --- a/arch/powerpc/perf/req-gen/_begin.h +++ b/arch/powerpc/perf/req-gen/_begin.h @@ -3,6 +3,8 @@ #ifndef POWERPC_PERF_REQ_GEN_H_ #define POWERPC_PERF_REQ_GEN_H_ +#include + #define CAT2_STR_(t, s) __stringify(t/s) #define CAT2_STR(t, s) CAT2_STR_(t, s) #define I(...) __VA_ARGS__ diff --git a/arch/powerpc/perf/req-gen/perf.h b/arch/powerpc/perf/req-gen/perf.h index 871a9a1766c2..fa9bc804e67a 100644 --- a/arch/powerpc/perf/req-gen/perf.h +++ b/arch/powerpc/perf/req-gen/perf.h @@ -3,6 +3,7 @@ #define LINUX_POWERPC_PERF_REQ_GEN_PERF_H_ #include +#include #ifndef REQUEST_FILE #error "REQUEST_FILE must be defined before including" diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c index 2c15ff094483..55aac74e1cb9 100644 --- a/arch/powerpc/platforms/cell/cbe_thermal.c +++ b/arch/powerpc/platforms/cell/cbe_thermal.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.h b/arch/powerpc/platforms/cell/spufs/sputrace.h index d557e999b662..1def11e911ac 100644 --- a/arch/powerpc/platforms/cell/spufs/sputrace.h +++ b/arch/powerpc/platforms/cell/spufs/sputrace.h @@ -3,6 +3,7 @@ #define _TRACE_SPUFS_H #include +#include #undef TRACE_SYSTEM #define TRACE_SYSTEM spufs diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index ae0100fd35bb..f5493dbdd7ff 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -15,6 +15,7 @@ #include #include #include +#include /* * Overview of Virtual Accelerator Switchboard (VAS). 
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 8a8033a249c7..f0e30dc94988 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 2c86cd188f8a5631f3d75a1dea14d22df85189b4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 5 Jul 2018 16:25:01 +0000 Subject: powerpc: clean inclusions of asm/feature-fixups.h files not using feature fixup don't need asm/feature-fixups.h files using feature fixup need asm/feature-fixups.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cputable.h | 1 - arch/powerpc/include/asm/dbell.h | 1 + arch/powerpc/include/asm/dt_cpu_ftrs.h | 1 - arch/powerpc/include/asm/exception-64s.h | 1 + arch/powerpc/include/asm/firmware.h | 1 - arch/powerpc/include/asm/kvm_booke_hv_asm.h | 2 ++ arch/powerpc/include/asm/mmu.h | 1 - arch/powerpc/include/asm/ppc_asm.h | 1 + arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/kernel/cpu_setup_6xx.S | 1 + arch/powerpc/kernel/entry_32.S | 1 + arch/powerpc/kernel/entry_64.S | 1 + arch/powerpc/kernel/exceptions-64e.S | 1 + arch/powerpc/kernel/exceptions-64s.S | 1 + arch/powerpc/kernel/fpu.S | 1 + arch/powerpc/kernel/head_32.S | 1 + arch/powerpc/kernel/head_64.S | 1 + arch/powerpc/kernel/head_fsl_booke.S | 1 + arch/powerpc/kernel/idle_6xx.S | 1 + arch/powerpc/kernel/idle_book3s.S | 1 + arch/powerpc/kernel/idle_e500.S | 1 + arch/powerpc/kernel/idle_power4.S | 1 + arch/powerpc/kernel/l2cr_6xx.S | 1 + arch/powerpc/kernel/misc_32.S | 1 + arch/powerpc/kernel/misc_64.S | 1 + arch/powerpc/kernel/setup_32.c | 1 + arch/powerpc/kernel/setup_64.c | 1 + arch/powerpc/kernel/swsusp_32.S | 1 + arch/powerpc/kernel/swsusp_asm64.S | 1 + arch/powerpc/kernel/tm.S | 1 + arch/powerpc/kvm/book3s_64_slb.S | 1 + arch/powerpc/kvm/book3s_hv_interrupts.S | 1 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1 + arch/powerpc/kvm/book3s_segment.S | 1 + arch/powerpc/lib/copypage_64.S | 1 + arch/powerpc/lib/copyuser_64.S | 1 + arch/powerpc/lib/hweight_64.S | 1 + arch/powerpc/lib/memcpy_64.S | 1 + arch/powerpc/mm/hash_low_32.S | 1 + arch/powerpc/mm/hash_native_64.c | 1 + arch/powerpc/mm/slb_low.S | 1 + arch/powerpc/mm/tlb_low_64e.S | 1 + arch/powerpc/mm/tlb_nohash_low.S | 1 + arch/powerpc/platforms/powermac/cache.S | 1 + arch/powerpc/platforms/powermac/sleep.S | 1 + arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + arch/powerpc/platforms/pseries/hvCall.S | 1 + tools/testing/selftests/powerpc/copyloops/asm/feature-fixups.h | 0 tools/testing/selftests/powerpc/primitives/asm/feature-fixups.h | 1 + 49 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 tools/testing/selftests/powerpc/copyloops/asm/feature-fixups.h create mode 120000 tools/testing/selftests/powerpc/primitives/asm/feature-fixups.h (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 751126c22ed9..29f49a35d6ee 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -4,7 +4,6 @@ #include -#include #include #include diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index 998c42ff1caa..99b84db23e8c 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -16,6 +16,7 @@ #include #include +#include #define PPC_DBELL_MSG_BRDCAST (0x04000000) #define PPC_DBELL_TYPE(x) (((x) & 0xf) << 
(63-36)) diff --git a/arch/powerpc/include/asm/dt_cpu_ftrs.h b/arch/powerpc/include/asm/dt_cpu_ftrs.h index 55113432fc91..0c729e2d0e8a 100644 --- a/arch/powerpc/include/asm/dt_cpu_ftrs.h +++ b/arch/powerpc/include/asm/dt_cpu_ftrs.h @@ -10,7 +10,6 @@ */ #include -#include #include #ifdef CONFIG_PPC_DT_CPU_FTRS diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index c40b4380951c..1f2efc1a9769 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -35,6 +35,7 @@ * implementations as possible. */ #include +#include /* PACA save area offsets (exgen, exmc, etc) */ #define EX_R9 0 diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index ce8aab72c21b..7a051bd21f87 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -14,7 +14,6 @@ #ifdef __KERNEL__ -#include #include /* firmware feature bitmask values */ diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h index e5f048bbcb7c..931260b59ac6 100644 --- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h +++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h @@ -9,6 +9,8 @@ #ifndef ASM_KVM_BOOKE_HV_ASM_H #define ASM_KVM_BOOKE_HV_ASM_H +#include + #ifdef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 8418d83b5eb0..13ea441ac531 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -5,7 +5,6 @@ #include -#include #include /* diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 75ece56dcd62..b5d023680801 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index d4a8dc71a057..486b7c83b8c5 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -14,6 +14,7 @@ #include #include #include +#include /* Pickup Book E specific registers. */ #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index a9f3970693e1..fa3c2c91290c 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -16,6 +16,7 @@ #include #include #include +#include _GLOBAL(__setup_cpu_603) mflr r5 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 7642cb984d3a..3bd097be90d9 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -34,6 +34,7 @@ #include #include #include +#include /* * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it include MSR_CE. diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index f62d9ddc0312..a9e74ecdab0b 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -44,6 +44,7 @@ #else #include #endif +#include /* * System calls. diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 3325f721e7b2..6d6e144a28ce 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -27,6 +27,7 @@ #include #include #include +#include /* XXX This will ultimately add space for a special exception save * structure used to save things like SRR0/SRR1, SPRGs, MAS, etc... 
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 36aec1f1cb2d..7a672dafd94f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -18,6 +18,7 @@ #include #include #include +#include /* * There are a few constraints to be concerned with. diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 07c913fd5aba..529dcc21c3f9 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -26,6 +26,7 @@ #include #include #include +#include #ifdef CONFIG_VSX #define __REST_32FPVSRS(n,c,base) \ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 29b2fed93289..61ca27929355 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -35,6 +35,7 @@ #include #include #include +#include /* 601 only have IBAT; cr0.eq is set on 601 when using this macro */ #define LOAD_BAT(n, reg, RA, RB) \ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 6eca15f25c73..4898e9491a1c 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -44,6 +44,7 @@ #include #include #include +#include /* The physical memory is laid out such that the secondary processor * spin code sits at 0x0000...0x00ff. On server, the vectors follow diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index bf4c6021515f..e2750b856c8f 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -43,6 +43,7 @@ #include #include #include +#include #include "head_booke.h" /* As with the other PowerPC ports, it is expected that when code diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S index 1686916cc7f0..ff026c9d3cab 100644 --- a/arch/powerpc/kernel/idle_6xx.S +++ b/arch/powerpc/kernel/idle_6xx.S @@ -20,6 +20,7 @@ #include #include #include +#include .text diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 436caa9d6eec..b107e3df7f8f 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -24,6 +24,7 @@ #include #include #include +#include #undef DEBUG diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S index b9b6ef510be1..583e55ac7d26 100644 --- a/arch/powerpc/kernel/idle_e500.S +++ b/arch/powerpc/kernel/idle_e500.S @@ -17,6 +17,7 @@ #include #include #include +#include .text diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S index 08faa93755f9..dd7471fe20bd 100644 --- a/arch/powerpc/kernel/idle_power4.S +++ b/arch/powerpc/kernel/idle_power4.S @@ -16,6 +16,7 @@ #include #include #include +#include #undef DEBUG diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S index 6408f09dbbd9..6e7dbb7d527c 100644 --- a/arch/powerpc/kernel/l2cr_6xx.S +++ b/arch/powerpc/kernel/l2cr_6xx.S @@ -45,6 +45,7 @@ #include #include #include +#include /* Usage: diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 3f7a9a2d2435..695b24a2d954 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -34,6 +34,7 @@ #include #include #include +#include .text diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index fa267e94090a..262ba9481781 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -28,6 +28,7 @@ #include #include #include +#include .text diff --git a/arch/powerpc/kernel/setup_32.c 
b/arch/powerpc/kernel/setup_32.c index ef747a5a30b9..0e3743343280 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -41,6 +41,7 @@ #include #include #include +#include #define DBG(fmt...) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 225bc5f91049..6a501b25dd85 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -68,6 +68,7 @@ #include #include #include +#include #include "setup.h" diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S index 34b73a262709..7a919e9a3400 100644 --- a/arch/powerpc/kernel/swsusp_32.S +++ b/arch/powerpc/kernel/swsusp_32.S @@ -7,6 +7,7 @@ #include #include #include +#include /* * Structure for storing CPU registers on the save area. diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S index 82d8aae81c6a..f83bf6f72cb0 100644 --- a/arch/powerpc/kernel/swsusp_asm64.S +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -13,6 +13,7 @@ #include #include #include +#include /* * Structure for storing CPU registers on the save area. diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 6bb6f5123dcf..6bffbc5affe7 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_VSX /* See fpu.S, this is borrowed from there */ diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index d293485c1a60..066c665dc86f 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -18,6 +18,7 @@ */ #include +#include #define SHADOW_SLB_ENTRY_LEN 0x10 #define OFFSET_ESID(x) (SHADOW_SLB_ENTRY_LEN * x) diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 4218073eea1f..666b91c79eb4 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -28,6 +28,7 @@ #include #include #include +#include /***************************************************************************** * * diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7405222a4e28..1d14046124a0 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -33,6 +33,7 @@ #include #include #include +#include /* Sign-extend HDEC if not on POWER9 */ #define EXTEND_HDEC(reg) \ diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 7fec258bb072..e5c542a7c5ac 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -20,6 +20,7 @@ /* Real mode helpers */ #include +#include #if defined(CONFIG_PPC_BOOK3S_64) diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S index 8d5034f645f3..694390357667 100644 --- a/arch/powerpc/lib/copypage_64.S +++ b/arch/powerpc/lib/copypage_64.S @@ -11,6 +11,7 @@ #include #include #include +#include .section ".toc","aw" PPC64_CACHES: diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S index 65244263b6a3..2d6f128d3ebe 100644 --- a/arch/powerpc/lib/copyuser_64.S +++ b/arch/powerpc/lib/copyuser_64.S @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef __BIG_ENDIAN__ #define sLd sld /* Shift towards low-numbered address. 
*/ diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S index 3de7ac154f24..0526b2225260 100644 --- a/arch/powerpc/lib/hweight_64.S +++ b/arch/powerpc/lib/hweight_64.S @@ -20,6 +20,7 @@ #include #include #include +#include /* Note: This code relies on -mminimal-toc */ diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S index 26ea02b7311f..94650d6eae9c 100644 --- a/arch/powerpc/lib/memcpy_64.S +++ b/arch/powerpc/lib/memcpy_64.S @@ -10,6 +10,7 @@ #include #include #include +#include .align 7 _GLOBAL_TOC(memcpy) diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index ffbd7c0bda96..26acf6c8c20c 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -27,6 +27,7 @@ #include #include #include +#include #ifdef CONFIG_SMP .section .bss diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index ffbd5ed4e8de..fc5dbbfd09fe 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index a83fbd2a4a24..4ac5057ad439 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -22,6 +22,7 @@ #include #include #include +#include /* * This macro generates asm code to compute the VSID scramble diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index eb82d787d99a..7fd20c52a8ec 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -22,6 +22,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_64K_PAGES #define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 505a3d010c47..e066a658acac 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -35,6 +35,7 @@ #include #include #include +#include #if defined(CONFIG_40x) diff --git a/arch/powerpc/platforms/powermac/cache.S b/arch/powerpc/platforms/powermac/cache.S index cc5347eb1662..27862feee4a5 100644 --- a/arch/powerpc/platforms/powermac/cache.S +++ b/arch/powerpc/platforms/powermac/cache.S @@ -17,6 +17,7 @@ #include #include #include +#include /* * Flush and disable all data caches (dL1, L2, L3). 
This is used diff --git a/arch/powerpc/platforms/powermac/sleep.S b/arch/powerpc/platforms/powermac/sleep.S index 1c2802fabd57..f89808b9713d 100644 --- a/arch/powerpc/platforms/powermac/sleep.S +++ b/arch/powerpc/platforms/powermac/sleep.S @@ -18,6 +18,7 @@ #include #include #include +#include #define MAGIC 0x4c617273 /* 'Lars' */ diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 4016e3c3d18b..3508be7d758a 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -15,6 +15,7 @@ #include #include #include +#include .section ".text" diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index c511a1743a44..d91412c591ef 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -13,6 +13,7 @@ #include #include #include +#include .section ".text" diff --git a/tools/testing/selftests/powerpc/copyloops/asm/feature-fixups.h b/tools/testing/selftests/powerpc/copyloops/asm/feature-fixups.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/testing/selftests/powerpc/primitives/asm/feature-fixups.h b/tools/testing/selftests/powerpc/primitives/asm/feature-fixups.h new file mode 120000 index 000000000000..8dc6d4d46e8e --- /dev/null +++ b/tools/testing/selftests/powerpc/primitives/asm/feature-fixups.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/asm/feature-fixups.h \ No newline at end of file -- cgit v1.2.3-70-g09d2 From 9412b234501eef1d273cc759387a232f0027c574 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 1 Aug 2018 09:01:14 +0000 Subject: powerpc/lib: Implement strlen() in assembly for PPC32 The generic implementation of strlen() reads strings byte by byte. This patch implements strlen() in assembly based on a read of entire words, in the same spirit as what some other arches and glibc do. On an 8xx the time spent in strlen is reduced by 3/4 for long strings.
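The word-at-a-time trick the new assembly relies on is documented in the comment block of strlen_32.S below. As a rough, hypothetical C model of that zero-byte test (illustration only, not part of the patch; it assumes a word-aligned string and 32-bit words):

-------
#include <stddef.h>
#include <stdint.h>

/*
 * Sketch of the detection step: (x - 0x01010101) & ~x & 0x80808080
 * is non-zero iff the 32-bit word x contains a 0x00 byte, so whole
 * words can be skipped until the word holding the terminator is found.
 */
static size_t strlen_wordwise(const char *s)
{
	const char *start = s;
	const uint32_t *p = (const uint32_t *)s;	/* assumes alignment */

	while (!((*p - 0x01010101u) & ~*p & 0x80808080u))
		p++;
	s = (const char *)p;		/* the terminator is in this word */
	while (*s)
		s++;
	return (size_t)(s - start);
}
-------

The real implementation must additionally handle a misaligned start, which the asm below does by rounding the pointer down and forcing the bytes before the string to non-zero.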
strlen() selftest on an 8xx provides the following values: Before the patch (i.e. with the generic strlen() in lib/string.c): len 256 : time = 1.195055 len 016 : time = 0.083745 len 008 : time = 0.046828 len 004 : time = 0.028390 After the patch: len 256 : time = 0.272185 ==> 78% improvement len 016 : time = 0.040632 ==> 51% improvement len 008 : time = 0.033060 ==> 29% improvement len 004 : time = 0.029149 ==> 2% degradation On an 832x: Before the patch: len 256 : time = 0.236125 len 016 : time = 0.018136 len 008 : time = 0.011000 len 004 : time = 0.007229 After the patch: len 256 : time = 0.094950 ==> 60% improvement len 016 : time = 0.013357 ==> 26% improvement len 008 : time = 0.010586 ==> 4% improvement len 004 : time = 0.008784 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/string.h | 2 + arch/powerpc/lib/Makefile | 2 +- arch/powerpc/lib/strlen_32.S | 78 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/lib/strlen_32.S (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index 9b8cedf618f4..1647de15a31e 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -50,6 +50,8 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) return __memset64(p, v, n * 8); } #else +#define __HAVE_ARCH_STRLEN + extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index d0ca13ad8231..670286808928 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) obj-y += string.o alloc.o code-patching.o feature-fixups.o -obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o +obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o strlen_32.o # See corresponding test in arch/powerpc/Makefile # 64-bit linker creates .sfpr on demand for final link (vmlinux), diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S new file mode 100644 index 000000000000..0a8d3f64d493 --- /dev/null +++ b/arch/powerpc/lib/strlen_32.S @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * strlen() for PPC32 + * + * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information. + * + * Inspired from glibc implementation + */ +#include +#include +#include + + .text + +/* + * Algorithm: + * + * 1) Given a word 'x', we can test to see if it contains any 0 bytes + * by subtracting 0x01010101, and seeing if any of the high bits of each + * byte changed from 0 to 1. This works because the least significant + * 0 byte must have had no incoming carry (otherwise it's not the least + * significant), so it is 0x00 - 0x01 == 0xff. For all other + * byte values, either they have the high bit set initially, or when + * 1 is subtracted you get a value in the range 0x00-0x7f, none of which + * have their high bit set. The expression here is + * (x - 0x01010101) & ~x & 0x80808080, which gives 0x00000000 when + * there were no 0x00 bytes in the word. You get 0x80 in bytes that + * match, but possibly false 0x80 matches in the next more significant + * byte to a true match due to carries. For little-endian this is + * of no consequence since the least significant match is the one + * we're interested in, but big-endian needs method 2 to find which + * byte matches.
+ * 2) Given a word 'x', we can test to see _which_ byte was zero by + * calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080). + * This produces 0x80 in each byte that was zero, and 0x00 in all + * the other bytes. The '| ~0x80808080' clears the low 7 bits in each + * byte, and the '| x' part ensures that bytes with the high bit set + * produce 0x00. The addition will carry into the high bit of each byte + * iff that byte had one of its low 7 bits set. We can then just see + * which was the most significant bit set and divide by 8 to find how + * many to add to the index. + * This is from the book 'The PowerPC Compiler Writer's Guide', + * by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren. + */ + +_GLOBAL(strlen) + andi. r0, r3, 3 + lis r7, 0x0101 + addi r10, r3, -4 + addic r7, r7, 0x0101 /* r7 = 0x01010101 (lomagic) & clear XER[CA] */ + rotlwi r6, r7, 31 /* r6 = 0x80808080 (himagic) */ + bne- 3f + .balign IFETCH_ALIGN_BYTES +1: lwzu r9, 4(r10) +2: subf r8, r7, r9 + and. r8, r8, r6 + beq+ 1b + andc. r8, r8, r9 + beq+ 1b + andc r8, r9, r6 + orc r9, r9, r6 + subfe r8, r6, r8 + nor r8, r8, r9 + cntlzw r8, r8 + subf r3, r3, r10 + srwi r8, r8, 3 + add r3, r3, r8 + blr + + /* Misaligned string: make sure bytes before the string are not seen as 0 */ +3: xor r10, r10, r0 + orc r8, r8, r8 + lwzu r9, 4(r10) + slwi r0, r0, 3 + srw r8, r8, r0 + orc r9, r9, r8 + b 2b +EXPORT_SYMBOL(strlen) -- cgit v1.2.3-70-g09d2 From 179ab1cbf883575c3a585bcfc0f2160f1d22a149 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 28 Jul 2018 09:06:34 +1000 Subject: powerpc/64: Add CONFIG_PPC_BARRIER_NOSPEC Add a config symbol to encode which platforms support the barrier_nospec speculation barrier. Currently this is just Book3S 64 but we will add Book3E in a future patch. Signed-off-by: Diana Craciun Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 7 ++++++- arch/powerpc/include/asm/barrier.h | 6 +++--- arch/powerpc/include/asm/setup.h | 2 +- arch/powerpc/kernel/Makefile | 3 ++- arch/powerpc/kernel/module.c | 4 +++- arch/powerpc/kernel/vmlinux.lds.S | 4 +++- arch/powerpc/lib/feature-fixups.c | 6 ++++-- 7 files changed, 22 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ee38fce075ee..846c89ed7fa3 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -165,7 +165,7 @@ config PPC select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE - select GENERIC_CPU_VULNERABILITIES if PPC_BOOK3S_64 + select GENERIC_CPU_VULNERABILITIES if PPC_BARRIER_NOSPEC select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_SMP_IDLE_THREAD @@ -242,6 +242,11 @@ config PPC # Please keep this list sorted alphabetically. # +config PPC_BARRIER_NOSPEC + bool + default y + depends on PPC_BOOK3S_64 + config GENERIC_CSUM def_bool n diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index de1316874e45..cdc6960506e2 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -78,7 +78,7 @@ do { \ ___p1; \ }) -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_BARRIER_NOSPEC /* * Prevent execution of subsequent instructions until preceding branches have * been fully resolved and are no longer executing speculatively. @@ -88,10 +88,10 @@ do { \ // This also acts as a compiler barrier due to the memory clobber.
#define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: "memory") -#else /* !CONFIG_PPC_BOOK3S_64 */ +#else /* !CONFIG_PPC_BARRIER_NOSPEC */ #define barrier_nospec_asm #define barrier_nospec() -#endif +#endif /* CONFIG_PPC_BARRIER_NOSPEC */ #include diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 8721fd004291..8205f9fdfd67 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -56,7 +56,7 @@ void setup_barrier_nospec(void); void do_barrier_nospec_fixups(bool enable); extern bool barrier_nospec_enabled; -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_BARRIER_NOSPEC void do_barrier_nospec_fixups_range(bool enable, void *start, void *end); #else static inline void do_barrier_nospec_fixups_range(bool enable, void *start, void *end) { }; diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 2b4c40b255e4..dbe2cf04b406 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -42,9 +42,10 @@ obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o -obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o security.o +obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o +obj-$(CONFIG_PPC_BARRIER_NOSPEC) += security.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o obj-$(CONFIG_PPC_970_NAP) += idle_power4.o diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 1b3c6835e730..77371c9ef3d8 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -72,13 +72,15 @@ int module_finalize(const Elf_Ehdr *hdr, do_feature_fixups(powerpc_firmware_features, (void *)sect->sh_addr, (void *)sect->sh_addr + sect->sh_size); +#endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC_BARRIER_NOSPEC sect = find_section(hdr, sechdrs, "__spec_barrier_fixup"); if (sect != NULL) do_barrier_nospec_fixups_range(barrier_nospec_enabled, (void *)sect->sh_addr, (void *)sect->sh_addr + sect->sh_size); -#endif +#endif /* CONFIG_PPC_BARRIER_NOSPEC */ sect = find_section(hdr, sechdrs, "__lwsync_fixup"); if (sect != NULL) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 5baac79df97e..07ae018e550e 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -153,14 +153,16 @@ SECTIONS *(__rfi_flush_fixup) __stop___rfi_flush_fixup = .; } +#endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC_BARRIER_NOSPEC . 
= ALIGN(8); __spec_barrier_fixup : AT(ADDR(__spec_barrier_fixup) - LOAD_OFFSET) { __start___barrier_nospec_fixup = .; *(__barrier_nospec_fixup) __stop___barrier_nospec_fixup = .; } -#endif +#endif /* CONFIG_PPC_BARRIER_NOSPEC */ EXCEPTION_TABLE(0) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 8b69f868298c..0e604b41b5d1 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -304,6 +304,9 @@ void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_ printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i); } +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BARRIER_NOSPEC void do_barrier_nospec_fixups(bool enable) { void *start, *end; @@ -313,8 +316,7 @@ void do_barrier_nospec_fixups(bool enable) do_barrier_nospec_fixups_range(enable, start, end); } - -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_BARRIER_NOSPEC */ void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { -- cgit v1.2.3-70-g09d2 From ebcd1bfc33c7a90df941df68a6e5d4018c022fba Mon Sep 17 00:00:00 2001 From: Diana Craciun Date: Sat, 28 Jul 2018 09:06:37 +1000 Subject: powerpc/fsl: Add barrier_nospec implementation for NXP PowerPC Book3E Implement barrier_nospec as an isync; sync instruction sequence. The implementation uses the infrastructure built for BOOK3S 64. Signed-off-by: Diana Craciun [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/barrier.h | 8 +++++++- arch/powerpc/lib/feature-fixups.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 846c89ed7fa3..8b6cd6780d8f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -245,7 +245,7 @@ config PPC config PPC_BARRIER_NOSPEC bool default y - depends on PPC_BOOK3S_64 + depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E config GENERIC_CSUM def_bool n diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index cdc6960506e2..fbe8df433019 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -78,12 +78,18 @@ do { \ ___p1; \ }) +#ifdef CONFIG_PPC_BOOK3S_64 +#define NOSPEC_BARRIER_SLOT nop +#elif defined(CONFIG_PPC_FSL_BOOK3E) +#define NOSPEC_BARRIER_SLOT nop; nop +#endif + #ifdef CONFIG_PPC_BARRIER_NOSPEC /* * Prevent execution of subsequent instructions until preceding branches have * been fully resolved and are no longer executing speculatively. */ -#define barrier_nospec_asm NOSPEC_BARRIER_FIXUP_SECTION; nop +#define barrier_nospec_asm NOSPEC_BARRIER_FIXUP_SECTION; NOSPEC_BARRIER_SLOT // This also acts as a compiler barrier due to the memory clobber.
#define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: "memory") diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 0e604b41b5d1..e613b02bb2f0 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -318,6 +318,37 @@ void do_barrier_nospec_fixups(bool enable) } #endif /* CONFIG_PPC_BARRIER_NOSPEC */ +#ifdef CONFIG_PPC_FSL_BOOK3E +void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end) +{ + unsigned int instr[2], *dest; + long *start, *end; + int i; + + start = fixup_start; + end = fixup_end; + + instr[0] = PPC_INST_NOP; + instr[1] = PPC_INST_NOP; + + if (enable) { + pr_info("barrier-nospec: using isync; sync as speculation barrier\n"); + instr[0] = PPC_INST_ISYNC; + instr[1] = PPC_INST_SYNC; + } + + for (i = 0; start < end; start++, i++) { + dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + patch_instruction(dest, instr[0]); + patch_instruction(dest + 1, instr[1]); + } + + printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i); +} +#endif /* CONFIG_PPC_FSL_BOOK3E */ + void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { long *start, *end; -- cgit v1.2.3-70-g09d2 From 06d0bbc6d0f56dacac3a79900e9a9a0d5972d818 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 24 Jul 2018 01:07:52 +1000 Subject: powerpc/asm: Add a patch_site macro & helpers for patching instructions Add a macro and some helper C functions for patching single asm instructions. The gas macro means we can do something like "1: nop; patch_site 1b, patch__foo", which is less visually distracting than defining a GLOBAL symbol at 1, and also doesn't pollute the symbol table, which can confuse e.g. perf. These are obviously similar to our existing feature sections, but are not automatically patched based on CPU/MMU features, rather they are designed to be manually patched by C code at some arbitrary point. Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/code-patching-asm.h | 18 ++++++++++++++++++ arch/powerpc/include/asm/code-patching.h | 2 ++ arch/powerpc/lib/code-patching.c | 16 ++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 arch/powerpc/include/asm/code-patching-asm.h (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/include/asm/code-patching-asm.h b/arch/powerpc/include/asm/code-patching-asm.h new file mode 100644 index 000000000000..ed7b1448493a --- /dev/null +++ b/arch/powerpc/include/asm/code-patching-asm.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright 2018, Michael Ellerman, IBM Corporation. + */ +#ifndef _ASM_POWERPC_CODE_PATCHING_ASM_H +#define _ASM_POWERPC_CODE_PATCHING_ASM_H + +/* Define a "site" that can be patched */ +.macro patch_site label name + .pushsection ".rodata" + .balign 4 + .global \name +\name: + .4byte \label - .
+ .popsection +.endm + +#endif /* _ASM_POWERPC_CODE_PATCHING_ASM_H */ diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 9ecc7bfc8ae7..31733a95bbd0 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -33,6 +33,8 @@ unsigned int create_cond_branch(const unsigned int *addr, int patch_branch(unsigned int *addr, unsigned long target, int flags); int patch_instruction(unsigned int *addr, unsigned int instr); int raw_patch_instruction(unsigned int *addr, unsigned int instr); +int patch_instruction_site(s32 *addr, unsigned int instr); +int patch_branch_site(s32 *site, unsigned long target, int flags); int instr_is_relative_branch(unsigned int instr); int instr_is_relative_link_branch(unsigned int instr); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index e0d881ab304e..850f3b8f4da5 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -195,6 +195,22 @@ int patch_branch(unsigned int *addr, unsigned long target, int flags) return patch_instruction(addr, create_branch(addr, target, flags)); } +int patch_branch_site(s32 *site, unsigned long target, int flags) +{ + unsigned int *addr; + + addr = (unsigned int *)((unsigned long)site + *site); + return patch_instruction(addr, create_branch(addr, target, flags)); +} + +int patch_instruction_site(s32 *site, unsigned int instr) +{ + unsigned int *addr; + + addr = (unsigned int *)((unsigned long)site + *site); + return patch_instruction(addr, instr); +} + bool is_offset_in_branch_range(long offset) { /* -- cgit v1.2.3-70-g09d2 From a7c81ce398e2ad304f61d6167155f3ef65a96524 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 3 Aug 2018 20:13:03 +1000 Subject: powerpc/64: Make exception table clearer in __copy_tofrom_user_base This aims to make the generation of exception table entries for the loads and stores in __copy_tofrom_user_base clearer and easier to verify. Instead of having a series of local labels on the loads and stores, with a series of corresponding labels later for the exception handlers, we now use macros to generate exception table entries at the point of each load and store that could potentially trap. We do this with the macros lex (load exception) and stex (store exception). These macros are used right before the load or store to which they apply. Some complexity is introduced by the fact that we have some more work to do after hitting an exception, because we need to calculate and return the number of bytes not copied. The code uses r3 as the current pointer into the destination buffer, that is, the address of the first byte of the destination that has not been modified. However, at various points in the copy loops, r3 can be 4, 8, 16 or 24 bytes behind that point. To express this offset in an understandable way, we define a symbol r3_offset which is updated at various points so that it is equal to the difference between the address of the first unmodified byte of the destination and the value in r3. (In fact it only needs to be accurate at the point of each lex or stex macro invocation.) The rules for updating r3_offset are as follows: * It starts out at 0 * An addi r3,r3,N instruction decreases r3_offset by N * A store instruction (stb, sth, stw, std) to N(r3) increases r3_offset by the width of the store (1, 2, 4, 8) * A store with update instruction (stbu, sthu, stwu, stdu) to N(r3) sets r3_offset to the width of the store.
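The arithmetic the exception handlers need follows directly from these rules. As a hypothetical C model of the quantity being computed (illustration only; bytes_not_copied is an invented name, and the real computation is of course done in assembly in .Lld_exc/.Lst_exc):

-------
/*
 * Model of the value returned after a fault: everything at and beyond
 * the first unmodified destination byte, which sits at r3 + r3_offset,
 * counts as "not copied".
 */
static unsigned long bytes_not_copied(unsigned long orig_dst,
				      unsigned long orig_len,
				      unsigned long r3,
				      unsigned long r3_offset)
{
	unsigned long first_unmodified = r3 + r3_offset;

	return (orig_dst + orig_len) - first_unmodified;
}
-------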
There is some trickiness to the way that the lex and stex macros and the associated exception handlers work. I would have liked to use the current value of r3_offset in the name of the symbol used as the exception handler, as in ".Lld_exc_$(r3_offset)" and then have symbols .Lld_exc_0, .Lld_exc_8, .Lld_exc_16 etc. corresponding to the offsets that needed to be added to r3. However, I couldn't see a way to do that with gas. Instead, the exception handler address is .Lld_exc - r3_offset or .Lst_exc - r3_offset, that is, the distance ahead of .Lld_exc/.Lst_exc that we start executing is equal to the amount that we need to add to r3. This works because r3_offset is always a small multiple of 4, and our instructions are 4 bytes long. This means that before .Lld_exc and .Lst_exc, we have a sequence of instructions that increments r3 by 4, 8, 16 or 24 depending on where we start. The sequence increments r3 by 4 per instruction (on average). We also replace the exception table for the 4k copy loop by a macro per load or store. These loads and stores all use exactly the same exception handler, which simply resets the argument registers r3, r4 and r5 to their original values and re-does the whole copy using the slower loop. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/lib/copyuser_64.S | 551 +++++++++++++++++------------------- 1 file changed, 225 insertions(+), 326 deletions(-) (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S index 2d6f128d3ebe..8c5033c85e37 100644 --- a/arch/powerpc/lib/copyuser_64.S +++ b/arch/powerpc/lib/copyuser_64.S @@ -20,6 +20,28 @@ #define sHd sld /* Shift towards high-numbered address. */ #endif +/* + * These macros are used to generate exception table entries. + * The exception handlers below use the original arguments + * (stored on the stack) and the point where we're up to in + * the destination buffer, i.e. the address of the first + * unmodified byte. Generally r3 points into the destination + * buffer, but the first unmodified byte is at a variable + * offset from r3. In the code below, the symbol r3_offset + * is set to indicate the current offset at each point in + * the code. This offset is then used as a negative offset + * from the exception handler code, and those instructions + * before the exception handlers are addi instructions that + * adjust r3 to point to the correct place. + */ + .macro lex /* exception handler for load */ +100: EX_TABLE(100b, .Lld_exc - r3_offset) + .endm + + .macro stex /* exception handler for store */ +100: EX_TABLE(100b, .Lst_exc - r3_offset) + .endm + .align 7 _GLOBAL_TOC(__copy_tofrom_user) #ifdef CONFIG_PPC_BOOK3S_64 @@ -30,7 +52,7 @@ FTR_SECTION_ELSE ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) #endif _GLOBAL(__copy_tofrom_user_base) - /* first check for a whole page copy on a page boundary */ + /* first check for a 4kB copy on a 4kB boundary */ cmpldi cr1,r5,16 cmpdi cr6,r5,4096 or r0,r3,r4 @@ -59,6 +81,7 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \ CPU_FTR_UNALIGNED_LD_STD) .Ldst_aligned: addi r3,r3,-16 +r3_offset = 16 BEGIN_FTR_SECTION andi. r0,r4,7 bne .Lsrc_unaligned END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) blt cr1,.Ldo_tail /* if < 16 bytes to copy */ srdi r0,r5,5 cmpdi cr1,r0,0 -20: ld r7,0(r4) -220: ld r6,8(r4) +lex; ld r7,0(r4) +lex; ld r6,8(r4) addi r4,r4,16 mtctr r0 andi.
r0,r5,0x10 beq 22f addi r3,r3,16 +r3_offset = 0 addi r4,r4,-16 mr r9,r7 mr r8,r6 beq cr1,72f -21: ld r7,16(r4) -221: ld r6,24(r4) +21: +lex; ld r7,16(r4) +lex; ld r6,24(r4) addi r4,r4,32 -70: std r9,0(r3) -270: std r8,8(r3) -22: ld r9,0(r4) -222: ld r8,8(r4) -71: std r7,16(r3) -271: std r6,24(r3) +stex; std r9,0(r3) +r3_offset = 8 +stex; std r8,8(r3) +r3_offset = 16 +22: +lex; ld r9,0(r4) +lex; ld r8,8(r4) +stex; std r7,16(r3) +r3_offset = 24 +stex; std r6,24(r3) addi r3,r3,32 +r3_offset = 0 bdnz 21b -72: std r9,0(r3) -272: std r8,8(r3) +72: +stex; std r9,0(r3) +r3_offset = 8 +stex; std r8,8(r3) +r3_offset = 16 andi. r5,r5,0xf beq+ 3f addi r4,r4,16 .Ldo_tail: addi r3,r3,16 +r3_offset = 0 bf cr7*4+0,246f -244: ld r9,0(r4) +lex; ld r9,0(r4) addi r4,r4,8 -245: std r9,0(r3) +stex; std r9,0(r3) addi r3,r3,8 246: bf cr7*4+1,1f -23: lwz r9,0(r4) +lex; lwz r9,0(r4) addi r4,r4,4 -73: stw r9,0(r3) +stex; stw r9,0(r3) addi r3,r3,4 1: bf cr7*4+2,2f -44: lhz r9,0(r4) +lex; lhz r9,0(r4) addi r4,r4,2 -74: sth r9,0(r3) +stex; sth r9,0(r3) addi r3,r3,2 2: bf cr7*4+3,3f -45: lbz r9,0(r4) -75: stb r9,0(r3) +lex; lbz r9,0(r4) +stex; stb r9,0(r3) 3: li r3,0 blr .Lsrc_unaligned: +r3_offset = 16 srdi r6,r5,3 addi r5,r5,-16 subf r4,r0,r4 @@ -129,58 +164,69 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) add r5,r5,r0 bt cr7*4+0,28f -24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */ -25: ld r0,8(r4) +lex; ld r9,0(r4) /* 3+2n loads, 2+2n stores */ +lex; ld r0,8(r4) sLd r6,r9,r10 -26: ldu r9,16(r4) +lex; ldu r9,16(r4) sHd r7,r0,r11 sLd r8,r0,r10 or r7,r7,r6 blt cr6,79f -27: ld r0,8(r4) +lex; ld r0,8(r4) b 2f -28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */ -29: ldu r9,8(r4) +28: +lex; ld r0,0(r4) /* 4+2n loads, 3+2n stores */ +lex; ldu r9,8(r4) sLd r8,r0,r10 addi r3,r3,-8 +r3_offset = 24 blt cr6,5f -30: ld r0,8(r4) +lex; ld r0,8(r4) sHd r12,r9,r11 sLd r6,r9,r10 -31: ldu r9,16(r4) +lex; ldu r9,16(r4) or r12,r8,r12 sHd r7,r0,r11 sLd r8,r0,r10 addi r3,r3,16 +r3_offset = 8 beq cr6,78f 1: or r7,r7,r6 -32: ld r0,8(r4) -76: std r12,8(r3) +lex; ld r0,8(r4) +stex; std r12,8(r3) +r3_offset = 16 2: sHd r12,r9,r11 sLd r6,r9,r10 -33: ldu r9,16(r4) +lex; ldu r9,16(r4) or r12,r8,r12 -77: stdu r7,16(r3) +stex; stdu r7,16(r3) +r3_offset = 8 sHd r7,r0,r11 sLd r8,r0,r10 bdnz 1b -78: std r12,8(r3) +78: +stex; std r12,8(r3) +r3_offset = 16 or r7,r7,r6 -79: std r7,16(r3) +79: +stex; std r7,16(r3) +r3_offset = 24 5: sHd r12,r9,r11 or r12,r8,r12 -80: std r12,24(r3) +stex; std r12,24(r3) +r3_offset = 32 bne 6f li r3,0 blr 6: cmpwi cr1,r5,8 addi r3,r3,32 +r3_offset = 0 sLd r9,r9,r10 ble cr1,7f -34: ld r0,8(r4) +lex; ld r0,8(r4) sHd r7,r0,r11 or r9,r7,r9 7: @@ -188,7 +234,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) #ifdef __BIG_ENDIAN__ rotldi r9,r9,32 #endif -94: stw r9,0(r3) +stex; stw r9,0(r3) #ifdef __LITTLE_ENDIAN__ rotrdi r9,r9,32 #endif @@ -197,7 +243,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) #ifdef __BIG_ENDIAN__ rotldi r9,r9,16 #endif -95: sth r9,0(r3) +stex; sth r9,0(r3) #ifdef __LITTLE_ENDIAN__ rotrdi r9,r9,16 #endif @@ -206,7 +252,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) #ifdef __BIG_ENDIAN__ rotldi r9,r9,8 #endif -96: stb r9,0(r3) +stex; stb r9,0(r3) #ifdef __LITTLE_ENDIAN__ rotrdi r9,r9,8 #endif @@ -214,47 +260,55 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) blr .Ldst_unaligned: +r3_offset = 0 PPC_MTOCRF(0x01,r6) /* put #bytes to 8B bdry into cr7 */ subf r5,r6,r5 li r7,0 cmpldi cr1,r5,16 bf cr7*4+3,1f -35: lbz r0,0(r4) -81: stb r0,0(r3) +100: EX_TABLE(100b, .Lld_exc_r7) + lbz r0,0(r4) +100: 
EX_TABLE(100b, .Lst_exc_r7) + stb r0,0(r3) addi r7,r7,1 1: bf cr7*4+2,2f -36: lhzx r0,r7,r4 -82: sthx r0,r7,r3 +100: EX_TABLE(100b, .Lld_exc_r7) + lhzx r0,r7,r4 +100: EX_TABLE(100b, .Lst_exc_r7) + sthx r0,r7,r3 addi r7,r7,2 2: bf cr7*4+1,3f -37: lwzx r0,r7,r4 -83: stwx r0,r7,r3 +100: EX_TABLE(100b, .Lld_exc_r7) + lwzx r0,r7,r4 +100: EX_TABLE(100b, .Lst_exc_r7) + stwx r0,r7,r3 3: PPC_MTOCRF(0x01,r5) add r4,r6,r4 add r3,r6,r3 b .Ldst_aligned .Lshort_copy: +r3_offset = 0 bf cr7*4+0,1f -38: lwz r0,0(r4) -39: lwz r9,4(r4) +lex; lwz r0,0(r4) +lex; lwz r9,4(r4) addi r4,r4,8 -84: stw r0,0(r3) -85: stw r9,4(r3) +stex; stw r0,0(r3) +stex; stw r9,4(r3) addi r3,r3,8 1: bf cr7*4+1,2f -40: lwz r0,0(r4) +lex; lwz r0,0(r4) addi r4,r4,4 -86: stw r0,0(r3) +stex; stw r0,0(r3) addi r3,r3,4 2: bf cr7*4+2,3f -41: lhz r0,0(r4) +lex; lhz r0,0(r4) addi r4,r4,2 -87: sth r0,0(r3) +stex; sth r0,0(r3) addi r3,r3,2 3: bf cr7*4+3,4f -42: lbz r0,0(r4) -88: stb r0,0(r3) +lex; lbz r0,0(r4) +stex; stb r0,0(r3) 4: li r3,0 blr @@ -262,48 +316,34 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) * exception handlers follow * we have to return the number of bytes not copied * for an exception on a load, we set the rest of the destination to 0 + * Note that the number of bytes of instructions for adjusting r3 needs + * to equal the amount of the adjustment, due to the trick of using + * .Lld_exc - r3_offset as the handler address. */ -136: -137: +.Lld_exc_r7: add r3,r3,r7 - b 1f -130: -131: + b .Lld_exc + + /* adjust by 24 */ addi r3,r3,8 -120: -320: -122: -322: -124: -125: -126: -127: -128: -129: -133: + nop + /* adjust by 16 */ addi r3,r3,8 -132: + nop + /* adjust by 8 */ addi r3,r3,8 -121: -321: -344: -134: -135: -138: -139: -140: -141: -142: -123: -144: -145: + nop /* - * here we have had a fault on a load and r3 points to the first - * unmodified byte of the destination + * Here we have had a fault on a load and r3 points to the first + * unmodified byte of the destination. We use the original arguments + * and r3 to work out how much wasn't copied. Since we load some + * distance ahead of the stores, we continue copying byte-by-byte until + * we hit the load fault again in order to copy as much as possible. */ -1: ld r6,-24(r1) +.Lld_exc: + ld r6,-24(r1) ld r4,-16(r1) ld r5,-8(r1) subf r6,r6,r3 @@ -314,9 +354,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) * first see if we can copy any more bytes before hitting another exception */ mtctr r5 +r3_offset = 0 +100: EX_TABLE(100b, .Ldone) 43: lbz r0,0(r4) addi r4,r4,1 -89: stb r0,0(r3) +stex; stb r0,0(r3) addi r3,r3,1 bdnz 43b li r3,0 /* huh? all copied successfully this time? */ @@ -325,116 +367,46 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) /* * here we have trapped again, amount remaining is in ctr. */ -143: mfctr r3 +.Ldone: + mfctr r3 blr /* * exception handlers for stores: we just need to work * out how many bytes weren't copied + * Note that the number of bytes of instructions for adjusting r3 needs + * to equal the amount of the adjustment, due to the trick of using + * .Lst_exc - r3_offset as the handler address. 
*/ -182: -183: +.Lst_exc_r7: add r3,r3,r7 - b 1f -371: -180: + b .Lst_exc + + /* adjust by 24 */ addi r3,r3,8 -171: -177: -179: + nop + /* adjust by 16 */ addi r3,r3,8 -370: -372: -176: -178: + nop + /* adjust by 8 */ addi r3,r3,4 -185: + /* adjust by 4 */ addi r3,r3,4 -170: -172: -345: -173: -174: -175: -181: -184: -186: -187: -188: -189: -194: -195: -196: -1: +.Lst_exc: ld r6,-24(r1) ld r5,-8(r1) add r6,r6,r5 - subf r3,r3,r6 /* #bytes not copied */ + subf r3,r3,r6 /* #bytes not copied in r3 */ blr - EX_TABLE(20b,120b) - EX_TABLE(220b,320b) - EX_TABLE(21b,121b) - EX_TABLE(221b,321b) - EX_TABLE(70b,170b) - EX_TABLE(270b,370b) - EX_TABLE(22b,122b) - EX_TABLE(222b,322b) - EX_TABLE(71b,171b) - EX_TABLE(271b,371b) - EX_TABLE(72b,172b) - EX_TABLE(272b,372b) - EX_TABLE(244b,344b) - EX_TABLE(245b,345b) - EX_TABLE(23b,123b) - EX_TABLE(73b,173b) - EX_TABLE(44b,144b) - EX_TABLE(74b,174b) - EX_TABLE(45b,145b) - EX_TABLE(75b,175b) - EX_TABLE(24b,124b) - EX_TABLE(25b,125b) - EX_TABLE(26b,126b) - EX_TABLE(27b,127b) - EX_TABLE(28b,128b) - EX_TABLE(29b,129b) - EX_TABLE(30b,130b) - EX_TABLE(31b,131b) - EX_TABLE(32b,132b) - EX_TABLE(76b,176b) - EX_TABLE(33b,133b) - EX_TABLE(77b,177b) - EX_TABLE(78b,178b) - EX_TABLE(79b,179b) - EX_TABLE(80b,180b) - EX_TABLE(34b,134b) - EX_TABLE(94b,194b) - EX_TABLE(95b,195b) - EX_TABLE(96b,196b) - EX_TABLE(35b,135b) - EX_TABLE(81b,181b) - EX_TABLE(36b,136b) - EX_TABLE(82b,182b) - EX_TABLE(37b,137b) - EX_TABLE(83b,183b) - EX_TABLE(38b,138b) - EX_TABLE(39b,139b) - EX_TABLE(84b,184b) - EX_TABLE(85b,185b) - EX_TABLE(40b,140b) - EX_TABLE(86b,186b) - EX_TABLE(41b,141b) - EX_TABLE(87b,187b) - EX_TABLE(42b,142b) - EX_TABLE(88b,188b) - EX_TABLE(43b,143b) - EX_TABLE(89b,189b) - /* * Routine to copy a whole page of data, optimized for POWER4. * On POWER4 it is more than 50% faster than the simple loop * above (following the .Ldst_aligned label). 
*/ + .macro exc +100: EX_TABLE(100b, .Labort) + .endm .Lcopy_page_4K: std r31,-32(1) std r30,-40(1) @@ -453,86 +425,86 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) li r0,5 0: addi r5,r5,-24 mtctr r0 -20: ld r22,640(4) -21: ld r21,512(4) -22: ld r20,384(4) -23: ld r11,256(4) -24: ld r9,128(4) -25: ld r7,0(4) -26: ld r25,648(4) -27: ld r24,520(4) -28: ld r23,392(4) -29: ld r10,264(4) -30: ld r8,136(4) -31: ldu r6,8(4) +exc; ld r22,640(4) +exc; ld r21,512(4) +exc; ld r20,384(4) +exc; ld r11,256(4) +exc; ld r9,128(4) +exc; ld r7,0(4) +exc; ld r25,648(4) +exc; ld r24,520(4) +exc; ld r23,392(4) +exc; ld r10,264(4) +exc; ld r8,136(4) +exc; ldu r6,8(4) cmpwi r5,24 1: -32: std r22,648(3) -33: std r21,520(3) -34: std r20,392(3) -35: std r11,264(3) -36: std r9,136(3) -37: std r7,8(3) -38: ld r28,648(4) -39: ld r27,520(4) -40: ld r26,392(4) -41: ld r31,264(4) -42: ld r30,136(4) -43: ld r29,8(4) -44: std r25,656(3) -45: std r24,528(3) -46: std r23,400(3) -47: std r10,272(3) -48: std r8,144(3) -49: std r6,16(3) -50: ld r22,656(4) -51: ld r21,528(4) -52: ld r20,400(4) -53: ld r11,272(4) -54: ld r9,144(4) -55: ld r7,16(4) -56: std r28,664(3) -57: std r27,536(3) -58: std r26,408(3) -59: std r31,280(3) -60: std r30,152(3) -61: stdu r29,24(3) -62: ld r25,664(4) -63: ld r24,536(4) -64: ld r23,408(4) -65: ld r10,280(4) -66: ld r8,152(4) -67: ldu r6,24(4) +exc; std r22,648(3) +exc; std r21,520(3) +exc; std r20,392(3) +exc; std r11,264(3) +exc; std r9,136(3) +exc; std r7,8(3) +exc; ld r28,648(4) +exc; ld r27,520(4) +exc; ld r26,392(4) +exc; ld r31,264(4) +exc; ld r30,136(4) +exc; ld r29,8(4) +exc; std r25,656(3) +exc; std r24,528(3) +exc; std r23,400(3) +exc; std r10,272(3) +exc; std r8,144(3) +exc; std r6,16(3) +exc; ld r22,656(4) +exc; ld r21,528(4) +exc; ld r20,400(4) +exc; ld r11,272(4) +exc; ld r9,144(4) +exc; ld r7,16(4) +exc; std r28,664(3) +exc; std r27,536(3) +exc; std r26,408(3) +exc; std r31,280(3) +exc; std r30,152(3) +exc; stdu r29,24(3) +exc; ld r25,664(4) +exc; ld r24,536(4) +exc; ld r23,408(4) +exc; ld r10,280(4) +exc; ld r8,152(4) +exc; ldu r6,24(4) bdnz 1b -68: std r22,648(3) -69: std r21,520(3) -70: std r20,392(3) -71: std r11,264(3) -72: std r9,136(3) -73: std r7,8(3) -74: addi r4,r4,640 -75: addi r3,r3,648 +exc; std r22,648(3) +exc; std r21,520(3) +exc; std r20,392(3) +exc; std r11,264(3) +exc; std r9,136(3) +exc; std r7,8(3) + addi r4,r4,640 + addi r3,r3,648 bge 0b mtctr r5 -76: ld r7,0(4) -77: ld r8,8(4) -78: ldu r9,16(4) +exc; ld r7,0(4) +exc; ld r8,8(4) +exc; ldu r9,16(4) 3: -79: ld r10,8(4) -80: std r7,8(3) -81: ld r7,16(4) -82: std r8,16(3) -83: ld r8,24(4) -84: std r9,24(3) -85: ldu r9,32(4) -86: stdu r10,32(3) +exc; ld r10,8(4) +exc; std r7,8(3) +exc; ld r7,16(4) +exc; std r8,16(3) +exc; ld r8,24(4) +exc; std r9,24(3) +exc; ldu r9,32(4) +exc; stdu r10,32(3) bdnz 3b 4: -87: ld r10,8(4) -88: std r7,8(3) -89: std r8,16(3) -90: std r9,24(3) -91: std r10,32(3) +exc; ld r10,8(4) +exc; std r7,8(3) +exc; std r8,16(3) +exc; std r9,24(3) +exc; std r10,32(3) 9: ld r20,-120(1) ld r21,-112(1) ld r22,-104(1) @@ -552,7 +524,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) * on an exception, reset to the beginning and jump back into the * standard __copy_tofrom_user */ -100: ld r20,-120(1) +.Labort: + ld r20,-120(1) ld r21,-112(1) ld r22,-104(1) ld r23,-96(1) @@ -568,78 +541,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) ld r4,-16(r1) li r5,4096 b .Ldst_aligned - - EX_TABLE(20b,100b) - EX_TABLE(21b,100b) - EX_TABLE(22b,100b) - EX_TABLE(23b,100b) - EX_TABLE(24b,100b) - 
EX_TABLE(25b,100b) - EX_TABLE(26b,100b) - EX_TABLE(27b,100b) - EX_TABLE(28b,100b) - EX_TABLE(29b,100b) - EX_TABLE(30b,100b) - EX_TABLE(31b,100b) - EX_TABLE(32b,100b) - EX_TABLE(33b,100b) - EX_TABLE(34b,100b) - EX_TABLE(35b,100b) - EX_TABLE(36b,100b) - EX_TABLE(37b,100b) - EX_TABLE(38b,100b) - EX_TABLE(39b,100b) - EX_TABLE(40b,100b) - EX_TABLE(41b,100b) - EX_TABLE(42b,100b) - EX_TABLE(43b,100b) - EX_TABLE(44b,100b) - EX_TABLE(45b,100b) - EX_TABLE(46b,100b) - EX_TABLE(47b,100b) - EX_TABLE(48b,100b) - EX_TABLE(49b,100b) - EX_TABLE(50b,100b) - EX_TABLE(51b,100b) - EX_TABLE(52b,100b) - EX_TABLE(53b,100b) - EX_TABLE(54b,100b) - EX_TABLE(55b,100b) - EX_TABLE(56b,100b) - EX_TABLE(57b,100b) - EX_TABLE(58b,100b) - EX_TABLE(59b,100b) - EX_TABLE(60b,100b) - EX_TABLE(61b,100b) - EX_TABLE(62b,100b) - EX_TABLE(63b,100b) - EX_TABLE(64b,100b) - EX_TABLE(65b,100b) - EX_TABLE(66b,100b) - EX_TABLE(67b,100b) - EX_TABLE(68b,100b) - EX_TABLE(69b,100b) - EX_TABLE(70b,100b) - EX_TABLE(71b,100b) - EX_TABLE(72b,100b) - EX_TABLE(73b,100b) - EX_TABLE(74b,100b) - EX_TABLE(75b,100b) - EX_TABLE(76b,100b) - EX_TABLE(77b,100b) - EX_TABLE(78b,100b) - EX_TABLE(79b,100b) - EX_TABLE(80b,100b) - EX_TABLE(81b,100b) - EX_TABLE(82b,100b) - EX_TABLE(83b,100b) - EX_TABLE(84b,100b) - EX_TABLE(85b,100b) - EX_TABLE(86b,100b) - EX_TABLE(87b,100b) - EX_TABLE(88b,100b) - EX_TABLE(89b,100b) - EX_TABLE(90b,100b) - EX_TABLE(91b,100b) - EXPORT_SYMBOL(__copy_tofrom_user) -- cgit v1.2.3-70-g09d2 From 98c45f51f767bfdd71d773cceaceb403352e51ae Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 3 Aug 2018 20:13:04 +1000 Subject: selftests/powerpc/64: Test all paths through copy routines The hand-coded assembler 64-bit copy routines include feature sections that select one code path or another depending on which CPU we are executing on. The self-tests for these copy routines end up testing just one path. This adds a mechanism for selecting any desired code path at compile time, and makes 2 or 3 versions of each test, each using a different code path, so as to cover all the possible paths. Signed-off-by: Paul Mackerras [mpe: Add -mcpu=power4 to CFLAGS for older compilers] Signed-off-by: Michael Ellerman --- arch/powerpc/lib/copyuser_64.S | 7 +++++ arch/powerpc/lib/copyuser_power7.S | 21 ++++++------- arch/powerpc/lib/memcpy_64.S | 9 ++++-- arch/powerpc/lib/memcpy_power7.S | 22 +++++++------- .../testing/selftests/powerpc/copyloops/.gitignore | 14 ++++++--- tools/testing/selftests/powerpc/copyloops/Makefile | 35 ++++++++++++++++++---- .../selftests/powerpc/copyloops/asm/ppc_asm.h | 21 +++++++------ 7 files changed, 85 insertions(+), 44 deletions(-) (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S index 8c5033c85e37..2197a35097c5 100644 --- a/arch/powerpc/lib/copyuser_64.S +++ b/arch/powerpc/lib/copyuser_64.S @@ -12,6 +12,11 @@ #include #include +#ifndef SELFTEST_CASE +/* 0 == most CPUs, 1 == POWER6, 2 == Cell */ +#define SELFTEST_CASE 0 +#endif + #ifdef __BIG_ENDIAN__ #define sLd sld /* Shift towards low-numbered address. */ #define sHd srd /* Shift towards high-numbered address. */ @@ -73,6 +78,7 @@ _GLOBAL(__copy_tofrom_user_base) * At the time of writing the only CPU that has this combination of bits * set is Power6. 
*/ +test_feature = (SELFTEST_CASE == 1) BEGIN_FTR_SECTION nop FTR_SECTION_ELSE @@ -82,6 +88,7 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \ .Ldst_aligned: addi r3,r3,-16 r3_offset = 16 +test_feature = (SELFTEST_CASE == 0) BEGIN_FTR_SECTION andi. r0,r4,7 bne .Lsrc_unaligned diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S index 215e4760c09f..1a1fe180af62 100644 --- a/arch/powerpc/lib/copyuser_power7.S +++ b/arch/powerpc/lib/copyuser_power7.S @@ -19,6 +19,11 @@ */ #include +#ifndef SELFTEST_CASE +/* 0 == don't use VMX, 1 == use VMX */ +#define SELFTEST_CASE 0 +#endif + #ifdef __BIG_ENDIAN__ #define LVS(VRT,RA,RB) lvsl VRT,RA,RB #define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC @@ -80,7 +85,6 @@ _GLOBAL(__copy_tofrom_user_power7) -#ifdef CONFIG_ALTIVEC cmpldi r5,16 cmpldi cr1,r5,3328 @@ -89,15 +93,12 @@ _GLOBAL(__copy_tofrom_user_power7) std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) blt .Lshort_copy - bge cr1,.Lvmx_copy -#else - cmpldi r5,16 - std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) - std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) - std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) - - blt .Lshort_copy +#ifdef CONFIG_ALTIVEC +test_feature = SELFTEST_CASE +BEGIN_FTR_SECTION + bgt cr1,.Lvmx_copy +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif .Lnonvmx_copy: @@ -278,8 +279,8 @@ err1; stb r0,0(r3) addi r1,r1,STACKFRAMESIZE b .Lnonvmx_copy -#ifdef CONFIG_ALTIVEC .Lvmx_copy: +#ifdef CONFIG_ALTIVEC mflr r0 std r0,16(r1) stdu r1,-STACKFRAMESIZE(r1) diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S index 94650d6eae9c..273ea67e60a1 100644 --- a/arch/powerpc/lib/memcpy_64.S +++ b/arch/powerpc/lib/memcpy_64.S @@ -12,6 +12,11 @@ #include #include +#ifndef SELFTEST_CASE +/* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */ +#define SELFTEST_CASE 0 +#endif + .align 7 _GLOBAL_TOC(memcpy) BEGIN_FTR_SECTION @@ -22,10 +27,8 @@ BEGIN_FTR_SECTION #endif FTR_SECTION_ELSE #ifdef CONFIG_PPC_BOOK3S_64 -#ifndef SELFTEST b memcpy_power7 #endif -#endif ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) #ifdef __LITTLE_ENDIAN__ /* dumb little-endian memcpy that will get replaced at runtime */ @@ -49,6 +52,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) cleared. At the time of writing the only CPU that has this combination of bits set is Power6. */ +test_feature = (SELFTEST_CASE == 1) BEGIN_FTR_SECTION nop FTR_SECTION_ELSE @@ -57,6 +61,7 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \ CPU_FTR_UNALIGNED_LD_STD) .Ldst_aligned: addi r3,r3,-16 +test_feature = (SELFTEST_CASE == 0) BEGIN_FTR_SECTION andi. 
r0,r4,7 bne .Lsrc_unaligned diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S index 070cdf6f584f..89bfefcf7fcc 100644 --- a/arch/powerpc/lib/memcpy_power7.S +++ b/arch/powerpc/lib/memcpy_power7.S @@ -19,7 +19,10 @@ */ #include -_GLOBAL(memcpy_power7) +#ifndef SELFTEST_CASE +/* 0 == don't use VMX, 1 == use VMX */ +#define SELFTEST_CASE 0 +#endif #ifdef __BIG_ENDIAN__ #define LVS(VRT,RA,RB) lvsl VRT,RA,RB @@ -29,20 +32,17 @@ _GLOBAL(memcpy_power7) #define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC #endif -#ifdef CONFIG_ALTIVEC +_GLOBAL(memcpy_power7) cmpldi r5,16 cmpldi cr1,r5,4096 - std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) - blt .Lshort_copy - bgt cr1,.Lvmx_copy -#else - cmpldi r5,16 - - std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) - blt .Lshort_copy +#ifdef CONFIG_ALTIVEC +test_feature = SELFTEST_CASE +BEGIN_FTR_SECTION + bgt cr1, .Lvmx_copy +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif .Lnonvmx_copy: @@ -223,8 +223,8 @@ _GLOBAL(memcpy_power7) addi r1,r1,STACKFRAMESIZE b .Lnonvmx_copy -#ifdef CONFIG_ALTIVEC .Lvmx_copy: +#ifdef CONFIG_ALTIVEC mflr r0 std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore index 25a192f62c4d..bc6c4ba153af 100644 --- a/tools/testing/selftests/powerpc/copyloops/.gitignore +++ b/tools/testing/selftests/powerpc/copyloops/.gitignore @@ -1,4 +1,10 @@ -copyuser_64 -copyuser_power7 -memcpy_64 -memcpy_power7 +copyuser_64_t0 +copyuser_64_t1 +copyuser_64_t2 +copyuser_power7_t0 +copyuser_power7_t1 +memcpy_64_t0 +memcpy_64_t1 +memcpy_64_t2 +memcpy_power7_t0 +memcpy_power7_t1 diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index eedce3366f64..498d8d875ab8 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -4,18 +4,41 @@ CFLAGS += -m64 CFLAGS += -I$(CURDIR) CFLAGS += -D SELFTEST CFLAGS += -maltivec +CFLAGS += -mcpu=power4 # Use our CFLAGS for the implicit .S rule & set the asm machine type ASFLAGS = $(CFLAGS) -Wa,-mpower4 -TEST_GEN_PROGS := copyuser_64 copyuser_power7 memcpy_64 memcpy_power7 +TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ + copyuser_p7_t0 copyuser_p7_t1 \ + memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \ + memcpy_p7_t0 memcpy_p7_t1 + EXTRA_SOURCES := validate.c ../harness.c include ../../lib.mk -$(OUTPUT)/copyuser_64: CPPFLAGS += -D COPY_LOOP=test___copy_tofrom_user_base -$(OUTPUT)/copyuser_power7: CPPFLAGS += -D COPY_LOOP=test___copy_tofrom_user_power7 -$(OUTPUT)/memcpy_64: CPPFLAGS += -D COPY_LOOP=test_memcpy -$(OUTPUT)/memcpy_power7: CPPFLAGS += -D COPY_LOOP=test_memcpy_power7 +$(OUTPUT)/copyuser_64_t%: copyuser_64.S $(EXTRA_SOURCES) + $(CC) $(CPPFLAGS) $(CFLAGS) \ + -D COPY_LOOP=test___copy_tofrom_user_base \ + -D SELFTEST_CASE=$(subst copyuser_64_t,,$(notdir $@)) \ + -o $@ $^ + +$(OUTPUT)/copyuser_p7_t%: copyuser_power7.S $(EXTRA_SOURCES) + $(CC) $(CPPFLAGS) $(CFLAGS) \ + -D COPY_LOOP=test___copy_tofrom_user_power7 \ + -D SELFTEST_CASE=$(subst copyuser_p7_t,,$(notdir $@)) \ + -o $@ $^ + +# Strictly speaking, we only need the memcpy_64 test cases for big-endian +$(OUTPUT)/memcpy_64_t%: memcpy_64.S $(EXTRA_SOURCES) + $(CC) $(CPPFLAGS) $(CFLAGS) \ + -D COPY_LOOP=test_memcpy \ + -D SELFTEST_CASE=$(subst memcpy_64_t,,$(notdir $@)) \ + -o $@ $^ -$(TEST_GEN_PROGS): $(EXTRA_SOURCES) +$(OUTPUT)/memcpy_p7_t%: memcpy_power7.S $(EXTRA_SOURCES) 
+ $(CC) $(CPPFLAGS) $(CFLAGS) \ + -D COPY_LOOP=test_memcpy_power7 \ + -D SELFTEST_CASE=$(subst memcpy_p7_t,,$(notdir $@)) \ + -o $@ $^ diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h index dfce161aebfc..91bc40392eb3 100644 --- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h +++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h @@ -43,17 +43,16 @@ FUNC_START(enter_vmx_ops) FUNC_START(exit_vmx_ops) blr -FUNC_START(memcpy_power7) - blr - -FUNC_START(__copy_tofrom_user_power7) - blr - FUNC_START(__copy_tofrom_user_base) blr -#define BEGIN_FTR_SECTION -#define FTR_SECTION_ELSE -#define ALT_FTR_SECTION_END_IFCLR(x) -#define ALT_FTR_SECTION_END(x, y) -#define END_FTR_SECTION_IFCLR(x) +#define BEGIN_FTR_SECTION .if test_feature +#define FTR_SECTION_ELSE .else +#define ALT_FTR_SECTION_END_IFCLR(x) .endif +#define ALT_FTR_SECTION_END_IFSET(x) .endif +#define ALT_FTR_SECTION_END(x, y) .endif +#define END_FTR_SECTION_IFCLR(x) .endif +#define END_FTR_SECTION_IFSET(x) .endif + +/* Default to taking the first of any alternative feature sections */ +test_feature = 1 -- cgit v1.2.3-70-g09d2 From f8db2007ff5838aff696bd4297eefcc77af2cf46 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 3 Aug 2018 20:13:06 +1000 Subject: powerpc/64: Copy as much as possible in __copy_tofrom_user In __copy_tofrom_user, if we encounter an exception on a store, we stop copying and return the number of bytes not copied. However, if the store is wider than one byte and is to an unaligned address, it is possible that the store operand overlaps a page boundary and the exception occurred on the latter part of the store operand, meaning that it would be possible to copy a few more bytes. Since copy_to_user is generally expected to copy as much as possible, it would be better to copy those extra few bytes. This adds code to do that. Since this edge case is not performance-critical, the code has been written to be compact rather than as fast as possible. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/lib/copyuser_64.S | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S index 2197a35097c5..96c514bee66b 100644 --- a/arch/powerpc/lib/copyuser_64.S +++ b/arch/powerpc/lib/copyuser_64.S @@ -379,8 +379,8 @@ stex; stb r0,0(r3) blr /* - * exception handlers for stores: we just need to work - * out how many bytes weren't copied + * exception handlers for stores: we need to work out how many bytes + * weren't copied, and we may need to copy some more. * Note that the number of bytes of instructions for adjusting r3 needs * to equal the amount of the adjustment, due to the trick of using * .Lst_exc - r3_offset as the handler address. @@ -400,10 +400,27 @@ stex; stb r0,0(r3) /* adjust by 4 */ addi r3,r3,4 .Lst_exc: - ld r6,-24(r1) - ld r5,-8(r1) - add r6,r6,r5 - subf r3,r3,r6 /* #bytes not copied in r3 */ + ld r6,-24(r1) /* original destination pointer */ + ld r4,-16(r1) /* original source pointer */ + ld r5,-8(r1) /* original number of bytes */ + add r7,r6,r5 + /* + * If the destination pointer isn't 8-byte aligned, + * we may have got the exception as a result of a + * store that overlapped a page boundary, so we may be + * able to copy a few more bytes. + */ +17: andi. 
r0,r3,7 + beq 19f + subf r8,r6,r3 /* #bytes copied */ +100: EX_TABLE(100b,19f) + lbzx r0,r8,r4 +100: EX_TABLE(100b,19f) + stb r0,0(r3) + addi r3,r3,1 + cmpld r3,r7 + blt 17b +19: subf r3,r3,r7 /* #bytes not copied in r3 */ blr /* -- cgit v1.2.3-70-g09d2 From fa54a981ea7a852c145b05c95abba11e81fc1157 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 9 Aug 2018 08:14:41 +0000 Subject: powerpc/lib: Use patch_site to patch copy_32 functions once cache is enabled The symbol memcpy_nocache_branch, defined in order to allow patching of the memset function once the cache is enabled, leads to confusing reports by the perf tool. Using the new patch_site functionality solves this issue. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/asm-prototypes.h | 1 + arch/powerpc/kernel/setup_32.c | 7 +++---- arch/powerpc/lib/copy_32.S | 9 ++++++--- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 70fdc5b9b9fb..1f4691ce4126 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -146,6 +146,7 @@ void _kvmppc_save_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr); /* Patch sites */ extern s32 patch__call_flush_count_cache; extern s32 patch__flush_count_cache_return; +extern s32 patch__memset_nocache, patch__memcpy_nocache; extern long flush_count_cache; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 0e3743343280..ba969278bf4d 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -97,11 +97,10 @@ notrace unsigned long __init early_init(unsigned long dt_ptr) * We do the initial parsing of the flat device-tree and prepares * for the MMU to be fully initialized. */ -extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ - notrace void __init machine_init(u64 dt_ptr) { - unsigned int *addr = &memset_nocache_branch; + unsigned int *addr = (unsigned int *)((unsigned long)&patch__memset_nocache + + patch__memset_nocache); unsigned long insn; /* Configure static keys first, now that we're relocated. */ @@ -110,7 +109,7 @@ notrace void __init machine_init(u64 dt_ptr) /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); - patch_instruction((unsigned int *)&memcpy, PPC_INST_NOP); + patch_instruction_site(&patch__memcpy_nocache, PPC_INST_NOP); insn = create_cond_branch(addr, branch_target(addr), 0x820000); patch_instruction(addr, insn); /* replace b by bne cr0 */ diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index da425bb6b369..ba66846fe973 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -13,6 +13,7 @@ #include #include #include +#include #define COPY_16_BYTES \ lwz r7,4(r4); \ @@ -107,8 +108,8 @@ _GLOBAL(memset) * Skip optimised block until cache is enabled. Will be replaced * by 'bne' during boot to use normal procedure if r4 is not zero */ -_GLOBAL(memset_nocache_branch) - b 2f +5: b 2f + patch_site 5b, patch__memset_nocache clrlwi r7,r6,32-LG_CACHELINE_BYTES add r8,r7,r5 @@ -168,7 +169,9 @@ _GLOBAL(memmove) /* fall through */ _GLOBAL(memcpy) - b generic_memcpy +1: b generic_memcpy + patch_site 1b, patch__memcpy_nocache + add r7,r3,r5 /* test if the src & dst overlap */ add r8,r4,r5 cmplw 0,r4,r7 -- cgit v1.2.3-70-g09d2
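To round off the series, here is a hypothetical, self-contained sketch of how the patch_site machinery introduced above is meant to be used from C. patch__my_feature and my_feature_fixup are invented names for illustration; the asm side is shown in the comment:

-------
#include <asm/code-patching.h>

/*
 * Asm side (sketch), in some .S file:
 *
 *	1:	nop
 *		patch_site 1b, patch__my_feature
 *
 * patch_site records a 32-bit offset to label 1 in .rodata under the
 * global symbol patch__my_feature.
 */
extern s32 patch__my_feature;

void my_feature_fixup(void);	/* invented branch target */

static void enable_my_feature(void)
{
	/* resolve the site to the nop's address and branch to the fixup */
	patch_branch_site(&patch__my_feature, (unsigned long)&my_feature_fixup, 0);
}
-------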