summaryrefslogtreecommitdiff
path: root/arch/powerpc/lib
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-08-17 11:32:50 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-08-17 11:32:50 -0700
commit5e2d059b52e397d9ac42f4c4d9d9a841887b5818 (patch)
treec8cd8fd7187113be33e29fcc75f45a8bbc27e6b2 /arch/powerpc/lib
parentd190775206d06397a9309421cac5ba2f2c243521 (diff)
parenta2dc009afa9ae8b92305be7728676562a104cb40 (diff)
Merge tag 'powerpc-4.19-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman: "Notable changes: - A fix for a bug in our page table fragment allocator, where a page table page could be freed and reallocated for something else while still in use, leading to memory corruption etc. The fix reuses pt_mm in struct page (x86 only) for a powerpc only refcount. - Fixes to our pkey support. Several are user-visible changes, but bring us in to line with x86 behaviour and/or fix outright bugs. Thanks to Florian Weimer for reporting many of these. - A series to improve the hvc driver & related OPAL console code, which have been seen to cause hardlockups at times. The hvc driver changes in particular have been in linux-next for ~month. - Increase our MAX_PHYSMEM_BITS to 128TB when SPARSEMEM_VMEMMAP=y. - Remove Power8 DD1 and Power9 DD1 support, neither chip should be in use anywhere other than as a paper weight. - An optimised memcmp implementation using Power7-or-later VMX instructions - Support for barrier_nospec on some NXP CPUs. - Support for flushing the count cache on context switch on some IBM CPUs (controlled by firmware), as a Spectre v2 mitigation. - A series to enhance the information we print on unhandled signals to bring it into line with other arches, including showing the offending VMA and dumping the instructions around the fault. Thanks to: Aaro Koskinen, Akshay Adiga, Alastair D'Silva, Alexey Kardashevskiy, Alexey Spirkov, Alistair Popple, Andrew Donnellan, Aneesh Kumar K.V, Anju T Sudhakar, Arnd Bergmann, Bartosz Golaszewski, Benjamin Herrenschmidt, Bharat Bhushan, Bjoern Noetel, Boqun Feng, Breno Leitao, Bryant G. Ly, Camelia Groza, Christophe Leroy, Christoph Hellwig, Cyril Bur, Dan Carpenter, Daniel Klamt, Darren Stevens, Dave Young, David Gibson, Diana Craciun, Finn Thain, Florian Weimer, Frederic Barrat, Gautham R. Shenoy, Geert Uytterhoeven, Geoff Levand, Guenter Roeck, Gustavo Romero, Haren Myneni, Hari Bathini, Joel Stanley, Jonathan Neuschäfer, Kees Cook, Madhavan Srinivasan, Mahesh Salgaonkar, Markus Elfring, Mathieu Malaterre, Mauro S. M. Rodrigues, Michael Hanselmann, Michael Neuling, Michael Schmitz, Mukesh Ojha, Murilo Opsfelder Araujo, Nicholas Piggin, Parth Y Shah, Paul Mackerras, Paul Menzel, Ram Pai, Randy Dunlap, Rashmica Gupta, Reza Arbab, Rodrigo R. Galvao, Russell Currey, Sam Bobroff, Scott Wood, Shilpasri G Bhat, Simon Guo, Souptick Joarder, Stan Johnson, Thiago Jung Bauermann, Tyrel Datwyler, Vaibhav Jain, Vasant Hegde, Venkat Rao, zhong jiang" * tag 'powerpc-4.19-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (234 commits) powerpc/mm/book3s/radix: Add mapping statistics powerpc/uaccess: Enable get_user(u64, *p) on 32-bit powerpc/mm/hash: Remove unnecessary do { } while(0) loop powerpc/64s: move machine check SLB flushing to mm/slb.c powerpc/powernv/idle: Fix build error powerpc/mm/tlbflush: update the mmu_gather page size while iterating address range powerpc/mm: remove warning about ‘type’ being set powerpc/32: Include setup.h header file to fix warnings powerpc: Move `path` variable inside DEBUG_PROM powerpc/powermac: Make some functions static powerpc/powermac: Remove variable x that's never read cxl: remove a dead branch powerpc/powermac: Add missing include of header pmac.h powerpc/kexec: Use common error handling code in setup_new_fdt() powerpc/xmon: Add address lookup for percpu symbols powerpc/mm: remove huge_pte_offset_and_shift() prototype powerpc/lib: Use patch_site to patch copy_32 functions once cache is enabled powerpc/pseries: Fix endianness while restoring of r3 in MCE handler. powerpc/fadump: merge adjacent memory ranges to reduce PT_LOAD segements powerpc/fadump: handle crash memory ranges array index overflow ...
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--arch/powerpc/lib/Makefile2
-rw-r--r--arch/powerpc/lib/code-patching.c16
-rw-r--r--arch/powerpc/lib/copy_32.S9
-rw-r--r--arch/powerpc/lib/copypage_64.S1
-rw-r--r--arch/powerpc/lib/copypage_power7.S4
-rw-r--r--arch/powerpc/lib/copyuser_64.S587
-rw-r--r--arch/powerpc/lib/copyuser_power7.S21
-rw-r--r--arch/powerpc/lib/feature-fixups-test.S1
-rw-r--r--arch/powerpc/lib/feature-fixups.c35
-rw-r--r--arch/powerpc/lib/hweight_64.S1
-rw-r--r--arch/powerpc/lib/ldstfp.S1
-rw-r--r--arch/powerpc/lib/locks.c1
-rw-r--r--arch/powerpc/lib/memcmp_64.S414
-rw-r--r--arch/powerpc/lib/memcpy_64.S11
-rw-r--r--arch/powerpc/lib/memcpy_power7.S28
-rw-r--r--arch/powerpc/lib/strlen_32.S78
-rw-r--r--arch/powerpc/lib/vmx-helper.c4
17 files changed, 839 insertions, 375 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index d0ca13ad8231..670286808928 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
obj-y += string.o alloc.o code-patching.o feature-fixups.o
-obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o
+obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o strlen_32.o
# See corresponding test in arch/powerpc/Makefile
# 64-bit linker creates .sfpr on demand for final link (vmlinux),
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index e0d881ab304e..850f3b8f4da5 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -195,6 +195,22 @@ int patch_branch(unsigned int *addr, unsigned long target, int flags)
return patch_instruction(addr, create_branch(addr, target, flags));
}
+int patch_branch_site(s32 *site, unsigned long target, int flags)
+{
+ unsigned int *addr;
+
+ addr = (unsigned int *)((unsigned long)site + *site);
+ return patch_instruction(addr, create_branch(addr, target, flags));
+}
+
+int patch_instruction_site(s32 *site, unsigned int instr)
+{
+ unsigned int *addr;
+
+ addr = (unsigned int *)((unsigned long)site + *site);
+ return patch_instruction(addr, instr);
+}
+
bool is_offset_in_branch_range(long offset)
{
/*
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index da425bb6b369..ba66846fe973 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -13,6 +13,7 @@
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/code-patching-asm.h>
#define COPY_16_BYTES \
lwz r7,4(r4); \
@@ -107,8 +108,8 @@ _GLOBAL(memset)
* Skip optimised bloc until cache is enabled. Will be replaced
* by 'bne' during boot to use normal procedure if r4 is not zero
*/
-_GLOBAL(memset_nocache_branch)
- b 2f
+5: b 2f
+ patch_site 5b, patch__memset_nocache
clrlwi r7,r6,32-LG_CACHELINE_BYTES
add r8,r7,r5
@@ -168,7 +169,9 @@ _GLOBAL(memmove)
/* fall through */
_GLOBAL(memcpy)
- b generic_memcpy
+1: b generic_memcpy
+ patch_site 1b, patch__memcpy_nocache
+
add r7,r3,r5 /* test if the src & dst overlap */
add r8,r4,r5
cmplw 0,r4,r7
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 8d5034f645f3..694390357667 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -11,6 +11,7 @@
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
+#include <asm/feature-fixups.h>
.section ".toc","aw"
PPC64_CACHES:
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7ab20e..e38f956f7d9f 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
- bl enter_vmx_copy
+ bl enter_vmx_ops
cmpwi r3,0
ld r0,STACKFRAMESIZE+16(r1)
ld r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
addi r3,r3,128
bdnz 1b
- b exit_vmx_copy /* tail call optimise */
+ b exit_vmx_ops /* tail call optimise */
#else
li r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 506677395681..96c514bee66b 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -9,6 +9,13 @@
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#ifndef SELFTEST_CASE
+/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
+#define SELFTEST_CASE 0
+#endif
#ifdef __BIG_ENDIAN__
#define sLd sld /* Shift towards low-numbered address. */
@@ -18,6 +25,28 @@
#define sHd sld /* Shift towards high-numbered address. */
#endif
+/*
+ * These macros are used to generate exception table entries.
+ * The exception handlers below use the original arguments
+ * (stored on the stack) and the point where we're up to in
+ * the destination buffer, i.e. the address of the first
+ * unmodified byte. Generally r3 points into the destination
+ * buffer, but the first unmodified byte is at a variable
+ * offset from r3. In the code below, the symbol r3_offset
+ * is set to indicate the current offset at each point in
+ * the code. This offset is then used as a negative offset
+ * from the exception handler code, and those instructions
+ * before the exception handlers are addi instructions that
+ * adjust r3 to point to the correct place.
+ */
+ .macro lex /* exception handler for load */
+100: EX_TABLE(100b, .Lld_exc - r3_offset)
+ .endm
+
+ .macro stex /* exception handler for store */
+100: EX_TABLE(100b, .Lst_exc - r3_offset)
+ .endm
+
.align 7
_GLOBAL_TOC(__copy_tofrom_user)
#ifdef CONFIG_PPC_BOOK3S_64
@@ -28,7 +57,7 @@ FTR_SECTION_ELSE
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#endif
_GLOBAL(__copy_tofrom_user_base)
- /* first check for a whole page copy on a page boundary */
+ /* first check for a 4kB copy on a 4kB boundary */
cmpldi cr1,r5,16
cmpdi cr6,r5,4096
or r0,r3,r4
@@ -49,6 +78,7 @@ _GLOBAL(__copy_tofrom_user_base)
* At the time of writing the only CPU that has this combination of bits
* set is Power6.
*/
+test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
nop
FTR_SECTION_ELSE
@@ -57,6 +87,8 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
addi r3,r3,-16
+r3_offset = 16
+test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
andi. r0,r4,7
bne .Lsrc_unaligned
@@ -64,57 +96,69 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
blt cr1,.Ldo_tail /* if < 16 bytes to copy */
srdi r0,r5,5
cmpdi cr1,r0,0
-20: ld r7,0(r4)
-220: ld r6,8(r4)
+lex; ld r7,0(r4)
+lex; ld r6,8(r4)
addi r4,r4,16
mtctr r0
andi. r0,r5,0x10
beq 22f
addi r3,r3,16
+r3_offset = 0
addi r4,r4,-16
mr r9,r7
mr r8,r6
beq cr1,72f
-21: ld r7,16(r4)
-221: ld r6,24(r4)
+21:
+lex; ld r7,16(r4)
+lex; ld r6,24(r4)
addi r4,r4,32
-70: std r9,0(r3)
-270: std r8,8(r3)
-22: ld r9,0(r4)
-222: ld r8,8(r4)
-71: std r7,16(r3)
-271: std r6,24(r3)
+stex; std r9,0(r3)
+r3_offset = 8
+stex; std r8,8(r3)
+r3_offset = 16
+22:
+lex; ld r9,0(r4)
+lex; ld r8,8(r4)
+stex; std r7,16(r3)
+r3_offset = 24
+stex; std r6,24(r3)
addi r3,r3,32
+r3_offset = 0
bdnz 21b
-72: std r9,0(r3)
-272: std r8,8(r3)
+72:
+stex; std r9,0(r3)
+r3_offset = 8
+stex; std r8,8(r3)
+r3_offset = 16
andi. r5,r5,0xf
beq+ 3f
addi r4,r4,16
.Ldo_tail:
addi r3,r3,16
+r3_offset = 0
bf cr7*4+0,246f
-244: ld r9,0(r4)
+lex; ld r9,0(r4)
addi r4,r4,8
-245: std r9,0(r3)
+stex; std r9,0(r3)
addi r3,r3,8
246: bf cr7*4+1,1f
-23: lwz r9,0(r4)
+lex; lwz r9,0(r4)
addi r4,r4,4
-73: stw r9,0(r3)
+stex; stw r9,0(r3)
addi r3,r3,4
1: bf cr7*4+2,2f
-44: lhz r9,0(r4)
+lex; lhz r9,0(r4)
addi r4,r4,2
-74: sth r9,0(r3)
+stex; sth r9,0(r3)
addi r3,r3,2
2: bf cr7*4+3,3f
-45: lbz r9,0(r4)
-75: stb r9,0(r3)
+lex; lbz r9,0(r4)
+stex; stb r9,0(r3)
3: li r3,0
blr
.Lsrc_unaligned:
+r3_offset = 16
srdi r6,r5,3
addi r5,r5,-16
subf r4,r0,r4
@@ -127,58 +171,69 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
add r5,r5,r0
bt cr7*4+0,28f
-24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */
-25: ld r0,8(r4)
+lex; ld r9,0(r4) /* 3+2n loads, 2+2n stores */
+lex; ld r0,8(r4)
sLd r6,r9,r10
-26: ldu r9,16(r4)
+lex; ldu r9,16(r4)
sHd r7,r0,r11
sLd r8,r0,r10
or r7,r7,r6
blt cr6,79f
-27: ld r0,8(r4)
+lex; ld r0,8(r4)
b 2f
-28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */
-29: ldu r9,8(r4)
+28:
+lex; ld r0,0(r4) /* 4+2n loads, 3+2n stores */
+lex; ldu r9,8(r4)
sLd r8,r0,r10
addi r3,r3,-8
+r3_offset = 24
blt cr6,5f
-30: ld r0,8(r4)
+lex; ld r0,8(r4)
sHd r12,r9,r11
sLd r6,r9,r10
-31: ldu r9,16(r4)
+lex; ldu r9,16(r4)
or r12,r8,r12
sHd r7,r0,r11
sLd r8,r0,r10
addi r3,r3,16
+r3_offset = 8
beq cr6,78f
1: or r7,r7,r6
-32: ld r0,8(r4)
-76: std r12,8(r3)
+lex; ld r0,8(r4)
+stex; std r12,8(r3)
+r3_offset = 16
2: sHd r12,r9,r11
sLd r6,r9,r10
-33: ldu r9,16(r4)
+lex; ldu r9,16(r4)
or r12,r8,r12
-77: stdu r7,16(r3)
+stex; stdu r7,16(r3)
+r3_offset = 8
sHd r7,r0,r11
sLd r8,r0,r10
bdnz 1b
-78: std r12,8(r3)
+78:
+stex; std r12,8(r3)
+r3_offset = 16
or r7,r7,r6
-79: std r7,16(r3)
+79:
+stex; std r7,16(r3)
+r3_offset = 24
5: sHd r12,r9,r11
or r12,r8,r12
-80: std r12,24(r3)
+stex; std r12,24(r3)
+r3_offset = 32
bne 6f
li r3,0
blr
6: cmpwi cr1,r5,8
addi r3,r3,32
+r3_offset = 0
sLd r9,r9,r10
ble cr1,7f
-34: ld r0,8(r4)
+lex; ld r0,8(r4)
sHd r7,r0,r11
or r9,r7,r9
7:
@@ -186,7 +241,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
#ifdef __BIG_ENDIAN__
rotldi r9,r9,32
#endif
-94: stw r9,0(r3)
+stex; stw r9,0(r3)
#ifdef __LITTLE_ENDIAN__
rotrdi r9,r9,32
#endif
@@ -195,7 +250,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
#ifdef __BIG_ENDIAN__
rotldi r9,r9,16
#endif
-95: sth r9,0(r3)
+stex; sth r9,0(r3)
#ifdef __LITTLE_ENDIAN__
rotrdi r9,r9,16
#endif
@@ -204,7 +259,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
#ifdef __BIG_ENDIAN__
rotldi r9,r9,8
#endif
-96: stb r9,0(r3)
+stex; stb r9,0(r3)
#ifdef __LITTLE_ENDIAN__
rotrdi r9,r9,8
#endif
@@ -212,47 +267,55 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
blr
.Ldst_unaligned:
+r3_offset = 0
PPC_MTOCRF(0x01,r6) /* put #bytes to 8B bdry into cr7 */
subf r5,r6,r5
li r7,0
cmpldi cr1,r5,16
bf cr7*4+3,1f
-35: lbz r0,0(r4)
-81: stb r0,0(r3)
+100: EX_TABLE(100b, .Lld_exc_r7)
+ lbz r0,0(r4)
+100: EX_TABLE(100b, .Lst_exc_r7)
+ stb r0,0(r3)
addi r7,r7,1
1: bf cr7*4+2,2f
-36: lhzx r0,r7,r4
-82: sthx r0,r7,r3
+100: EX_TABLE(100b, .Lld_exc_r7)
+ lhzx r0,r7,r4
+100: EX_TABLE(100b, .Lst_exc_r7)
+ sthx r0,r7,r3
addi r7,r7,2
2: bf cr7*4+1,3f
-37: lwzx r0,r7,r4
-83: stwx r0,r7,r3
+100: EX_TABLE(100b, .Lld_exc_r7)
+ lwzx r0,r7,r4
+100: EX_TABLE(100b, .Lst_exc_r7)
+ stwx r0,r7,r3
3: PPC_MTOCRF(0x01,r5)
add r4,r6,r4
add r3,r6,r3
b .Ldst_aligned
.Lshort_copy:
+r3_offset = 0
bf cr7*4+0,1f
-38: lwz r0,0(r4)
-39: lwz r9,4(r4)
+lex; lwz r0,0(r4)
+lex; lwz r9,4(r4)
addi r4,r4,8
-84: stw r0,0(r3)
-85: stw r9,4(r3)
+stex; stw r0,0(r3)
+stex; stw r9,4(r3)
addi r3,r3,8
1: bf cr7*4+1,2f
-40: lwz r0,0(r4)
+lex; lwz r0,0(r4)
addi r4,r4,4
-86: stw r0,0(r3)
+stex; stw r0,0(r3)
addi r3,r3,4
2: bf cr7*4+2,3f
-41: lhz r0,0(r4)
+lex; lhz r0,0(r4)
addi r4,r4,2
-87: sth r0,0(r3)
+stex; sth r0,0(r3)
addi r3,r3,2
3: bf cr7*4+3,4f
-42: lbz r0,0(r4)
-88: stb r0,0(r3)
+lex; lbz r0,0(r4)
+stex; stb r0,0(r3)
4: li r3,0
blr
@@ -260,48 +323,34 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
* exception handlers follow
* we have to return the number of bytes not copied
* for an exception on a load, we set the rest of the destination to 0
+ * Note that the number of bytes of instructions for adjusting r3 needs
+ * to equal the amount of the adjustment, due to the trick of using
+ * .Lld_exc - r3_offset as the handler address.
*/
-136:
-137:
+.Lld_exc_r7:
add r3,r3,r7
- b 1f
-130:
-131:
+ b .Lld_exc
+
+ /* adjust by 24 */
addi r3,r3,8
-120:
-320:
-122:
-322:
-124:
-125:
-126:
-127:
-128:
-129:
-133:
+ nop
+ /* adjust by 16 */
addi r3,r3,8
-132:
+ nop
+ /* adjust by 8 */
addi r3,r3,8
-121:
-321:
-344:
-134:
-135:
-138:
-139:
-140:
-141:
-142:
-123:
-144:
-145:
+ nop
/*
- * here we have had a fault on a load and r3 points to the first
- * unmodified byte of the destination
+ * Here we have had a fault on a load and r3 points to the first
+ * unmodified byte of the destination. We use the original arguments
+ * and r3 to work out how much wasn't copied. Since we load some
+ * distance ahead of the stores, we continue copying byte-by-byte until
+ * we hit the load fault again in order to copy as much as possible.
*/
-1: ld r6,-24(r1)
+.Lld_exc:
+ ld r6,-24(r1)
ld r4,-16(r1)
ld r5,-8(r1)
subf r6,r6,r3
@@ -312,9 +361,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
* first see if we can copy any more bytes before hitting another exception
*/
mtctr r5
+r3_offset = 0
+100: EX_TABLE(100b, .Ldone)
43: lbz r0,0(r4)
addi r4,r4,1
-89: stb r0,0(r3)
+stex; stb r0,0(r3)
addi r3,r3,1
bdnz 43b
li r3,0 /* huh? all copied successfully this time? */
@@ -323,116 +374,63 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
/*
* here we have trapped again, amount remaining is in ctr.
*/
-143: mfctr r3
+.Ldone:
+ mfctr r3
blr
/*
- * exception handlers for stores: we just need to work
- * out how many bytes weren't copied
+ * exception handlers for stores: we need to work out how many bytes
+ * weren't copied, and we may need to copy some more.
+ * Note that the number of bytes of instructions for adjusting r3 needs
+ * to equal the amount of the adjustment, due to the trick of using
+ * .Lst_exc - r3_offset as the handler address.
*/
-182:
-183:
+.Lst_exc_r7:
add r3,r3,r7
- b 1f
-371:
-180:
+ b .Lst_exc
+
+ /* adjust by 24 */
addi r3,r3,8
-171:
-177:
-179:
+ nop
+ /* adjust by 16 */
addi r3,r3,8
-370:
-372:
-176:
-178:
+ nop
+ /* adjust by 8 */
addi r3,r3,4
-185:
+ /* adjust by 4 */
addi r3,r3,4
-170:
-172:
-345:
-173:
-174:
-175:
-181:
-184:
-186:
-187:
-188:
-189:
-194:
-195:
-196:
-1:
- ld r6,-24(r1)
- ld r5,-8(r1)
- add r6,r6,r5
- subf r3,r3,r6 /* #bytes not copied */
+.Lst_exc:
+ ld r6,-24(r1) /* original destination pointer */
+ ld r4,-16(r1) /* original source pointer */
+ ld r5,-8(r1) /* original number of bytes */
+ add r7,r6,r5
+ /*
+ * If the destination pointer isn't 8-byte aligned,
+ * we may have got the exception as a result of a
+ * store that overlapped a page boundary, so we may be
+ * able to copy a few more bytes.
+ */
+17: andi. r0,r3,7
+ beq 19f
+ subf r8,r6,r3 /* #bytes copied */
+100: EX_TABLE(100b,19f)
+ lbzx r0,r8,r4
+100: EX_TABLE(100b,19f)
+ stb r0,0(r3)
+ addi r3,r3,1
+ cmpld r3,r7
+ blt 17b
+19: subf r3,r3,r7 /* #bytes not copied in r3 */
blr
- EX_TABLE(20b,120b)
- EX_TABLE(220b,320b)
- EX_TABLE(21b,121b)
- EX_TABLE(221b,321b)
- EX_TABLE(70b,170b)
- EX_TABLE(270b,370b)
- EX_TABLE(22b,122b)
- EX_TABLE(222b,322b)
- EX_TABLE(71b,171b)
- EX_TABLE(271b,371b)
- EX_TABLE(72b,172b)
- EX_TABLE(272b,372b)
- EX_TABLE(244b,344b)
- EX_TABLE(245b,345b)
- EX_TABLE(23b,123b)
- EX_TABLE(73b,173b)
- EX_TABLE(44b,144b)
- EX_TABLE(74b,174b)
- EX_TABLE(45b,145b)
- EX_TABLE(75b,175b)
- EX_TABLE(24b,124b)
- EX_TABLE(25b,125b)
- EX_TABLE(26b,126b)
- EX_TABLE(27b,127b)
- EX_TABLE(28b,128b)
- EX_TABLE(29b,129b)
- EX_TABLE(30b,130b)
- EX_TABLE(31b,131b)
- EX_TABLE(32b,132b)
- EX_TABLE(76b,176b)
- EX_TABLE(33b,133b)
- EX_TABLE(77b,177b)
- EX_TABLE(78b,178b)
- EX_TABLE(79b,179b)
- EX_TABLE(80b,180b)
- EX_TABLE(34b,134b)
- EX_TABLE(94b,194b)
- EX_TABLE(95b,195b)
- EX_TABLE(96b,196b)
- EX_TABLE(35b,135b)
- EX_TABLE(81b,181b)
- EX_TABLE(36b,136b)
- EX_TABLE(82b,182b)
- EX_TABLE(37b,137b)
- EX_TABLE(83b,183b)
- EX_TABLE(38b,138b)
- EX_TABLE(39b,139b)
- EX_TABLE(84b,184b)
- EX_TABLE(85b,185b)
- EX_TABLE(40b,140b)
- EX_TABLE(86b,186b)
- EX_TABLE(41b,141b)
- EX_TABLE(87b,187b)
- EX_TABLE(42b,142b)
- EX_TABLE(88b,188b)
- EX_TABLE(43b,143b)
- EX_TABLE(89b,189b)
-
/*
* Routine to copy a whole page of data, optimized for POWER4.
* On POWER4 it is more than 50% faster than the simple loop
* above (following the .Ldst_aligned label).
*/
+ .macro exc
+100: EX_TABLE(100b, .Labort)
+ .endm
.Lcopy_page_4K:
std r31,-32(1)
std r30,-40(1)
@@ -451,86 +449,86 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
li r0,5
0: addi r5,r5,-24
mtctr r0
-20: ld r22,640(4)
-21: ld r21,512(4)
-22: ld r20,384(4)
-23: ld r11,256(4)
-24: ld r9,128(4)
-25: ld r7,0(4)
-26: ld r25,648(4)
-27: ld r24,520(4)
-28: ld r23,392(4)
-29: ld r10,264(4)
-30: ld r8,136(4)
-31: ldu r6,8(4)
+exc; ld r22,640(4)
+exc; ld r21,512(4)
+exc; ld r20,384(4)
+exc; ld r11,256(4)
+exc; ld r9,128(4)
+exc; ld r7,0(4)
+exc; ld r25,648(4)
+exc; ld r24,520(4)
+exc; ld r23,392(4)
+exc; ld r10,264(4)
+exc; ld r8,136(4)
+exc; ldu r6,8(4)
cmpwi r5,24
1:
-32: std r22,648(3)
-33: std r21,520(3)
-34: std r20,392(3)
-35: std r11,264(3)
-36: std r9,136(3)
-37: std r7,8(3)
-38: ld r28,648(4)
-39: ld r27,520(4)
-40: ld r26,392(4)
-41: ld r31,264(4)
-42: ld r30,136(4)
-43: ld r29,8(4)
-44: std r25,656(3)
-45: std r24,528(3)
-46: std r23,400(3)
-47: std r10,272(3)
-48: std r8,144(3)
-49: std r6,16(3)
-50: ld r22,656(4)
-51: ld r21,528(4)
-52: ld r20,400(4)
-53: ld r11,272(4)
-54: ld r9,144(4)
-55: ld r7,16(4)
-56: std r28,664(3)
-57: std r27,536(3)
-58: std r26,408(3)
-59: std r31,280(3)
-60: std r30,152(3)
-61: stdu r29,24(3)
-62: ld r25,664(4)
-63: ld r24,536(4)
-64: ld r23,408(4)
-65: ld r10,280(4)
-66: ld r8,152(4)
-67: ldu r6,24(4)
+exc; std r22,648(3)
+exc; std r21,520(3)
+exc; std r20,392(3)
+exc; std r11,264(3)
+exc; std r9,136(3)
+exc; std r7,8(3)
+exc; ld r28,648(4)
+exc; ld r27,520(4)
+exc; ld r26,392(4)
+exc; ld r31,264(4)
+exc; ld r30,136(4)
+exc; ld r29,8(4)
+exc; std r25,656(3)
+exc; std r24,528(3)
+exc; std r23,400(3)
+exc; std r10,272(3)
+exc; std r8,144(3)
+exc; std r6,16(3)
+exc; ld r22,656(4)
+exc; ld r21,528(4)
+exc; ld r20,400(4)
+exc; ld r11,272(4)
+exc; ld r9,144(4)
+exc; ld r7,16(4)
+exc; std r28,664(3)
+exc; std r27,536(3)
+exc; std r26,408(3)
+exc; std r31,280(3)
+exc; std r30,152(3)
+exc; stdu r29,24(3)
+exc; ld r25,664(4)
+exc; ld r24,536(4)
+exc; ld r23,408(4)
+exc; ld r10,280(4)
+exc; ld r8,152(4)
+exc; ldu r6,24(4)
bdnz 1b
-68: std r22,648(3)
-69: std r21,520(3)
-70: std r20,392(3)
-71: std r11,264(3)
-72: std r9,136(3)
-73: std r7,8(3)
-74: addi r4,r4,640
-75: addi r3,r3,648
+exc; std r22,648(3)
+exc; std r21,520(3)
+exc; std r20,392(3)
+exc; std r11,264(3)
+exc; std r9,136(3)
+exc; std r7,8(3)
+ addi r4,r4,640
+ addi r3,r3,648
bge 0b
mtctr r5
-76: ld r7,0(4)
-77: ld r8,8(4)
-78: ldu r9,16(4)
+exc; ld r7,0(4)
+exc; ld r8,8(4)
+exc; ldu r9,16(4)
3:
-79: ld r10,8(4)
-80: std r7,8(3)
-81: ld r7,16(4)
-82: std r8,16(3)
-83: ld r8,24(4)
-84: std r9,24(3)
-85: ldu r9,32(4)
-86: stdu r10,32(3)
+exc; ld r10,8(4)
+exc; std r7,8(3)
+exc; ld r7,16(4)
+exc; std r8,16(3)
+exc; ld r8,24(4)
+exc; std r9,24(3)
+exc; ldu r9,32(4)
+exc; stdu r10,32(3)
bdnz 3b
4:
-87: ld r10,8(4)
-88: std r7,8(3)
-89: std r8,16(3)
-90: std r9,24(3)
-91: std r10,32(3)
+exc; ld r10,8(4)
+exc; std r7,8(3)
+exc; std r8,16(3)
+exc; std r9,24(3)
+exc; std r10,32(3)
9: ld r20,-120(1)
ld r21,-112(1)
ld r22,-104(1)
@@ -550,7 +548,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
* on an exception, reset to the beginning and jump back into the
* standard __copy_tofrom_user
*/
-100: ld r20,-120(1)
+.Labort:
+ ld r20,-120(1)
ld r21,-112(1)
ld r22,-104(1)
ld r23,-96(1)
@@ -566,78 +565,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
ld r4,-16(r1)
li r5,4096
b .Ldst_aligned
-
- EX_TABLE(20b,100b)
- EX_TABLE(21b,100b)
- EX_TABLE(22b,100b)
- EX_TABLE(23b,100b)
- EX_TABLE(24b,100b)
- EX_TABLE(25b,100b)
- EX_TABLE(26b,100b)
- EX_TABLE(27b,100b)
- EX_TABLE(28b,100b)
- EX_TABLE(29b,100b)
- EX_TABLE(30b,100b)
- EX_TABLE(31b,100b)
- EX_TABLE(32b,100b)
- EX_TABLE(33b,100b)
- EX_TABLE(34b,100b)
- EX_TABLE(35b,100b)
- EX_TABLE(36b,100b)
- EX_TABLE(37b,100b)
- EX_TABLE(38b,100b)
- EX_TABLE(39b,100b)
- EX_TABLE(40b,100b)
- EX_TABLE(41b,100b)
- EX_TABLE(42b,100b)
- EX_TABLE(43b,100b)
- EX_TABLE(44b,100b)
- EX_TABLE(45b,100b)
- EX_TABLE(46b,100b)
- EX_TABLE(47b,100b)
- EX_TABLE(48b,100b)
- EX_TABLE(49b,100b)
- EX_TABLE(50b,100b)
- EX_TABLE(51b,100b)
- EX_TABLE(52b,100b)
- EX_TABLE(53b,100b)
- EX_TABLE(54b,100b)
- EX_TABLE(55b,100b)
- EX_TABLE(56b,100b)
- EX_TABLE(57b,100b)
- EX_TABLE(58b,100b)
- EX_TABLE(59b,100b)
- EX_TABLE(60b,100b)
- EX_TABLE(61b,100b)
- EX_TABLE(62b,100b)
- EX_TABLE(63b,100b)
- EX_TABLE(64b,100b)
- EX_TABLE(65b,100b)
- EX_TABLE(66b,100b)
- EX_TABLE(67b,100b)
- EX_TABLE(68b,100b)
- EX_TABLE(69b,100b)
- EX_TABLE(70b,100b)
- EX_TABLE(71b,100b)
- EX_TABLE(72b,100b)
- EX_TABLE(73b,100b)
- EX_TABLE(74b,100b)
- EX_TABLE(75b,100b)
- EX_TABLE(76b,100b)
- EX_TABLE(77b,100b)
- EX_TABLE(78b,100b)
- EX_TABLE(79b,100b)
- EX_TABLE(80b,100b)
- EX_TABLE(81b,100b)
- EX_TABLE(82b,100b)
- EX_TABLE(83b,100b)
- EX_TABLE(84b,100b)
- EX_TABLE(85b,100b)
- EX_TABLE(86b,100b)
- EX_TABLE(87b,100b)
- EX_TABLE(88b,100b)
- EX_TABLE(89b,100b)
- EX_TABLE(90b,100b)
- EX_TABLE(91b,100b)
-
EXPORT_SYMBOL(__copy_tofrom_user)
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index 215e4760c09f..1a1fe180af62 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -19,6 +19,11 @@
*/
#include <asm/ppc_asm.h>
+#ifndef SELFTEST_CASE
+/* 0 == don't use VMX, 1 == use VMX */
+#define SELFTEST_CASE 0
+#endif
+
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
@@ -80,7 +85,6 @@
_GLOBAL(__copy_tofrom_user_power7)
-#ifdef CONFIG_ALTIVEC
cmpldi r5,16
cmpldi cr1,r5,3328
@@ -89,15 +93,12 @@ _GLOBAL(__copy_tofrom_user_power7)
std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
blt .Lshort_copy
- bge cr1,.Lvmx_copy
-#else
- cmpldi r5,16
- std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
- std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
-
- blt .Lshort_copy
+#ifdef CONFIG_ALTIVEC
+test_feature = SELFTEST_CASE
+BEGIN_FTR_SECTION
+ bgt cr1,.Lvmx_copy
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
.Lnonvmx_copy:
@@ -278,8 +279,8 @@ err1; stb r0,0(r3)
addi r1,r1,STACKFRAMESIZE
b .Lnonvmx_copy
-#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
+#ifdef CONFIG_ALTIVEC
mflr r0
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S
index f16cec989506..ee7c5fd5fc64 100644
--- a/arch/powerpc/lib/feature-fixups-test.S
+++ b/arch/powerpc/lib/feature-fixups-test.S
@@ -11,6 +11,7 @@
#include <asm/feature-fixups.h>
#include <asm/ppc_asm.h>
#include <asm/synch.h>
+#include <asm/asm-compat.h>
.text
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 8b69f868298c..e613b02bb2f0 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -304,6 +304,9 @@ void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_
printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BARRIER_NOSPEC
void do_barrier_nospec_fixups(bool enable)
{
void *start, *end;
@@ -313,8 +316,38 @@ void do_barrier_nospec_fixups(bool enable)
do_barrier_nospec_fixups_range(enable, start, end);
}
+#endif /* CONFIG_PPC_BARRIER_NOSPEC */
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end)
+{
+ unsigned int instr[2], *dest;
+ long *start, *end;
+ int i;
+
+ start = fixup_start;
+ end = fixup_end;
+
+ instr[0] = PPC_INST_NOP;
+ instr[1] = PPC_INST_NOP;
+
+ if (enable) {
+ pr_info("barrier-nospec: using isync; sync as speculation barrier\n");
+ instr[0] = PPC_INST_ISYNC;
+ instr[1] = PPC_INST_SYNC;
+ }
+
+ for (i = 0; start < end; start++, i++) {
+ dest = (void *)start + *start;
+
+ pr_devel("patching dest %lx\n", (unsigned long)dest);
+ patch_instruction(dest, instr[0]);
+ patch_instruction(dest + 1, instr[1]);
+ }
+
+ printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_FSL_BOOK3E */
void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
{
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
index 3de7ac154f24..0526b2225260 100644
--- a/arch/powerpc/lib/hweight_64.S
+++ b/arch/powerpc/lib/hweight_64.S
@@ -20,6 +20,7 @@
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/feature-fixups.h>
/* Note: This code relies on -mminimal-toc */
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index ae15eba49c1f..32e91994b6b2 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -15,6 +15,7 @@
#include <asm/ppc-opcode.h>
#include <asm/reg.h>
#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
#include <linux/errno.h>
#ifdef CONFIG_PPC_FPU
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index b7b1237d4aa6..35a0ef932e1a 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -15,7 +15,6 @@
#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/export.h>
-#include <linux/stringify.h>
#include <linux/smp.h>
/* waiting for a spinlock... */
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b7bd55..844d8e774492 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
*/
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/ppc-opcode.h>
#define off8 r6
#define off16 r7
@@ -24,28 +25,102 @@
#define rH r31
#ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
#define LD ldbrx
+#define LVS lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+ vperm _VRT,_VRB,_VRA,_VRC
#else
+#define LH lhzx
+#define LW lwzx
#define LD ldx
+#define LVS lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+ vperm _VRT,_VRA,_VRB,_VRC
#endif
-_GLOBAL(memcmp)
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS \
+ mflr r0; \
+ std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+ std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+ std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+ std r0,16(r1); \
+ stdu r1,-STACKFRAMESIZE(r1); \
+ bl enter_vmx_ops; \
+ cmpwi cr1,r3,0; \
+ ld r0,STACKFRAMESIZE+16(r1); \
+ ld r3,STK_REG(R31)(r1); \
+ ld r4,STK_REG(R30)(r1); \
+ ld r5,STK_REG(R29)(r1); \
+ addi r1,r1,STACKFRAMESIZE; \
+ mtlr r0
+
+#define EXIT_VMX_OPS \
+ mflr r0; \
+ std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+ std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+ std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+ std r0,16(r1); \
+ stdu r1,-STACKFRAMESIZE(r1); \
+ bl exit_vmx_ops; \
+ ld r0,STACKFRAMESIZE+16(r1); \
+ ld r3,STK_REG(R31)(r1); \
+ ld r4,STK_REG(R30)(r1); \
+ ld r5,STK_REG(R29)(r1); \
+ addi r1,r1,STACKFRAMESIZE; \
+ mtlr r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ * ^ ^ ^
+ * 0xbbbb10 0xbbbb20 0xbbb30
+ * ^
+ * _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ * for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ * for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ * for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+ lvx _v2nd_qw,_vaddr,off16; \
+ VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_xxxx
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_xxxx
+ */
+_GLOBAL_TOC(memcmp)
cmpdi cr1,r5,0
- /* Use the short loop if both strings are not 8B aligned */
- or r6,r3,r4
+ /* Use the short loop if the src/dst addresses are not
+ * with the same offset of 8 bytes align boundary.
+ */
+ xor r6,r3,r4
andi. r6,r6,7
- /* Use the short loop if length is less than 32B */
- cmpdi cr6,r5,31
+ /* Fall back to short loop if compare at aligned addrs
+ * with less than 8 bytes.
+ */
+ cmpdi cr6,r5,7
beq cr1,.Lzero
- bne .Lshort
- bgt cr6,.Llong
+ bgt cr6,.Lno_short
.Lshort:
mtctr r5
-
1: lbz rA,0(r3)
lbz rB,0(r4)
subf. rC,rB,rA
@@ -78,11 +153,98 @@ _GLOBAL(memcmp)
li r3,0
blr
+.Lno_short:
+ dcbt 0,r3
+ dcbt 0,r4
+ bne .Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+ /* attempt to compare bytes not aligned with 8 bytes so that
+ * rest comparison can run based on 8 bytes alignment.
+ */
+ andi. r6,r3,7
+
+ /* Try to compare the first double word which is not 8 bytes aligned:
+ * load the first double word at (src & ~7UL) and shift left appropriate
+ * bits before comparision.
+ */
+ rlwinm r6,r3,3,26,28
+ beq .Lsameoffset_8bytes_aligned
+ clrrdi r3,r3,3
+ clrrdi r4,r4,3
+ LD rA,0,r3
+ LD rB,0,r4
+ sld rA,rA,r6
+ sld rB,rB,r6
+ cmpld cr0,rA,rB
+ srwi r6,r6,3
+ bne cr0,.LcmpAB_lightweight
+ subfic r6,r6,8
+ subf. r5,r6,r5
+ addi r3,r3,8
+ addi r4,r4,8
+ beq .Lzero
+
+.Lsameoffset_8bytes_aligned:
+ /* now we are aligned with 8 bytes.
+ * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+ */
+ cmpdi cr6,r5,31
+ bgt cr6,.Llong
+
+.Lcmp_lt32bytes:
+ /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
+ cmpdi cr5,r5,7
+ srdi r0,r5,3
+ ble cr5,.Lcmp_rest_lt8bytes
+
+ /* handle 8 ~ 31 bytes */
+ clrldi r5,r5,61
+ mtctr r0
+2:
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ addi r3,r3,8
+ addi r4,r4,8
+ bne cr0,.LcmpAB_lightweight
+ bdnz 2b
+
+ cmpwi r5,0
+ beq .Lzero
+
+.Lcmp_rest_lt8bytes:
+ /* Here we have only less than 8 bytes to compare with. at least s1
+ * Address is aligned with 8 bytes.
+ * The next double words are load and shift right with appropriate
+ * bits.
+ */
+ subfic r6,r5,8
+ slwi r6,r6,3
+ LD rA,0,r3
+ LD rB,0,r4
+ srd rA,rA,r6
+ srd rB,rB,r6
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ b .Lzero
+
.Lnon_zero:
mr r3,rC
blr
.Llong:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+ /* Try to use vmx loop if length is equal or greater than 4K */
+ cmpldi cr6,r5,VMX_THRESH
+ bge cr6,.Lsameoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Llong_novmx_cmp:
+#endif
+ /* At least s1 addr is aligned with 8 bytes */
li off8,8
li off16,16
li off24,24
@@ -232,4 +394,240 @@ _GLOBAL(memcmp)
ld r28,-32(r1)
ld r27,-40(r1)
blr
+
+.LcmpAB_lightweight: /* skip NV GPRS restore */
+ li r3,1
+ bgtlr
+ li r3,-1
+ blr
+
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+ /* Enter with src/dst addrs has the same offset with 8 bytes
+ * align boundary.
+ *
+ * There is an optimization based on following fact: memcmp()
+ * prones to fail early at the first 32 bytes.
+ * Before applying VMX instructions which will lead to 32x128bits
+ * VMX regs load/restore penalty, we compare the first 32 bytes
+ * so that we can catch the ~80% fail cases.
+ */
+
+ li r0,4
+ mtctr r0
+.Lsameoffset_prechk_32B_loop:
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ addi r3,r3,8
+ addi r4,r4,8
+ bne cr0,.LcmpAB_lightweight
+ addi r5,r5,-8
+ bdnz .Lsameoffset_prechk_32B_loop
+
+ ENTER_VMX_OPS
+ beq cr1,.Llong_novmx_cmp
+
+3:
+ /* need to check whether r4 has the same offset with r3
+ * for 16 bytes boundary.
+ */
+ xor r0,r3,r4
+ andi. r0,r0,0xf
+ bne .Ldiffoffset_vmx_cmp_start
+
+ /* len is no less than 4KB. Need to align with 16 bytes further.
+ */
+ andi. rA,r3,8
+ LD rA,0,r3
+ beq 4f
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ addi r3,r3,8
+ addi r4,r4,8
+ addi r5,r5,-8
+
+ beq cr0,4f
+ /* save and restore cr0 */
+ mfocrf r5,128
+ EXIT_VMX_OPS
+ mtocrf 128,r5
+ b .LcmpAB_lightweight
+
+4:
+ /* compare 32 bytes for each loop */
+ srdi r0,r5,5
+ mtctr r0
+ clrldi r5,r5,59
+ li off16,16
+
+.balign 16
+5:
+ lvx v0,0,r3
+ lvx v1,0,r4
+ VCMPEQUD_RC(v0,v0,v1)
+ bnl cr6,7f
+ lvx v0,off16,r3
+ lvx v1,off16,r4
+ VCMPEQUD_RC(v0,v0,v1)
+ bnl cr6,6f
+ addi r3,r3,32
+ addi r4,r4,32
+ bdnz 5b
+
+ EXIT_VMX_OPS
+ cmpdi r5,0
+ beq .Lzero
+ b .Lcmp_lt32bytes
+
+6:
+ addi r3,r3,16
+ addi r4,r4,16
+
+7:
+ /* diff the last 16 bytes */
+ EXIT_VMX_OPS
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ li off8,8
+ bne cr0,.LcmpAB_lightweight
+
+ LD rA,off8,r3
+ LD rB,off8,r4
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ b .Lzero
+#endif
+
+.Ldiffoffset_8bytes_make_align_start:
+ /* now try to align s1 with 8 bytes */
+ rlwinm r6,r3,3,26,28
+ beq .Ldiffoffset_align_s1_8bytes
+
+ clrrdi r3,r3,3
+ LD rA,0,r3
+ LD rB,0,r4 /* unaligned load */
+ sld rA,rA,r6
+ srd rA,rA,r6
+ srd rB,rB,r6
+ cmpld cr0,rA,rB
+ srwi r6,r6,3
+ bne cr0,.LcmpAB_lightweight
+
+ subfic r6,r6,8
+ subf. r5,r6,r5
+ addi r3,r3,8
+ add r4,r4,r6
+
+ beq .Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+ /* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+ /* only do vmx ops when the size equal or greater than 4K bytes */
+ cmpdi cr5,r5,VMX_THRESH
+ bge cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
+
+ cmpdi cr5,r5,31
+ ble cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+ b .Llong_novmx_cmp
+#else
+ b .Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+ /* perform a 32 bytes pre-checking before
+ * enable VMX operations.
+ */
+ li r0,4
+ mtctr r0
+.Ldiffoffset_prechk_32B_loop:
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ addi r3,r3,8
+ addi r4,r4,8
+ bne cr0,.LcmpAB_lightweight
+ addi r5,r5,-8
+ bdnz .Ldiffoffset_prechk_32B_loop
+
+ ENTER_VMX_OPS
+ beq cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+ /* Firstly try to align r3 with 16 bytes */
+ andi. r6,r3,0xf
+ li off16,16
+ beq .Ldiffoffset_vmx_s1_16bytes_align
+
+ LVS v3,0,r3
+ LVS v4,0,r4
+
+ lvx v5,0,r3
+ lvx v6,0,r4
+ LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+ LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+ VCMPEQUB_RC(v7,v9,v10)
+ bnl cr6,.Ldiffoffset_vmx_diff_found
+
+ subfic r6,r6,16
+ subf r5,r6,r5
+ add r3,r3,r6
+ add r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+ /* now s1 is aligned with 16 bytes */
+ lvx v6,0,r4
+ LVS v4,0,r4
+ srdi r6,r5,5 /* loop for 32 bytes each */
+ clrldi r5,r5,59
+ mtctr r6
+
+.balign 16
+.Ldiffoffset_vmx_32bytesloop:
+ /* the first qw of r4 was saved in v6 */
+ lvx v9,0,r3
+ LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+ VCMPEQUB_RC(v7,v9,v10)
+ vor v6,v8,v8
+ bnl cr6,.Ldiffoffset_vmx_diff_found
+
+ addi r3,r3,16
+ addi r4,r4,16
+
+ lvx v9,0,r3
+ LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+ VCMPEQUB_RC(v7,v9,v10)
+ vor v6,v8,v8
+ bnl cr6,.Ldiffoffset_vmx_diff_found
+
+ addi r3,r3,16
+ addi r4,r4,16
+
+ bdnz .Ldiffoffset_vmx_32bytesloop
+
+ EXIT_VMX_OPS
+
+ cmpdi r5,0
+ beq .Lzero
+ b .Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+ EXIT_VMX_OPS
+ /* anyway, the diff will appear in next 16 bytes */
+ li r5,16
+ b .Lcmp_lt32bytes
+
+#endif
EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 8d8265be1a59..273ea67e60a1 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -9,6 +9,13 @@
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#ifndef SELFTEST_CASE
+/* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */
+#define SELFTEST_CASE 0
+#endif
.align 7
_GLOBAL_TOC(memcpy)
@@ -20,10 +27,8 @@ BEGIN_FTR_SECTION
#endif
FTR_SECTION_ELSE
#ifdef CONFIG_PPC_BOOK3S_64
-#ifndef SELFTEST
b memcpy_power7
#endif
-#endif
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#ifdef __LITTLE_ENDIAN__
/* dumb little-endian memcpy that will get replaced at runtime */
@@ -47,6 +52,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
cleared.
At the time of writing the only CPU that has this combination of bits
set is Power6. */
+test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
nop
FTR_SECTION_ELSE
@@ -55,6 +61,7 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
addi r3,r3,-16
+test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
andi. r0,r4,7
bne .Lsrc_unaligned
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d3da08..89bfefcf7fcc 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -19,7 +19,10 @@
*/
#include <asm/ppc_asm.h>
-_GLOBAL(memcpy_power7)
+#ifndef SELFTEST_CASE
+/* 0 == don't use VMX, 1 == use VMX */
+#define SELFTEST_CASE 0
+#endif
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
@@ -29,20 +32,17 @@ _GLOBAL(memcpy_power7)
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
#endif
-#ifdef CONFIG_ALTIVEC
+_GLOBAL(memcpy_power7)
cmpldi r5,16
cmpldi cr1,r5,4096
-
std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-
blt .Lshort_copy
- bgt cr1,.Lvmx_copy
-#else
- cmpldi r5,16
-
- std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- blt .Lshort_copy
+#ifdef CONFIG_ALTIVEC
+test_feature = SELFTEST_CASE
+BEGIN_FTR_SECTION
+ bgt cr1, .Lvmx_copy
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
.Lnonvmx_copy:
@@ -223,14 +223,14 @@ _GLOBAL(memcpy_power7)
addi r1,r1,STACKFRAMESIZE
b .Lnonvmx_copy
-#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
+#ifdef CONFIG_ALTIVEC
mflr r0
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
- bl enter_vmx_copy
+ bl enter_vmx_ops
cmpwi cr1,r3,0
ld r0,STACKFRAMESIZE+16(r1)
ld r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
15: addi r1,r1,STACKFRAMESIZE
ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- b exit_vmx_copy /* tail call optimise */
+ b exit_vmx_ops /* tail call optimise */
.Lvmx_unaligned_copy:
/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
15: addi r1,r1,STACKFRAMESIZE
ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- b exit_vmx_copy /* tail call optimise */
+ b exit_vmx_ops /* tail call optimise */
#endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S
new file mode 100644
index 000000000000..0a8d3f64d493
--- /dev/null
+++ b/arch/powerpc/lib/strlen_32.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * strlen() for PPC32
+ *
+ * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information.
+ *
+ * Inspired from glibc implementation
+ */
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+#include <asm/cache.h>
+
+ .text
+
+/*
+ * Algorithm:
+ *
+ * 1) Given a word 'x', we can test to see if it contains any 0 bytes
+ * by subtracting 0x01010101, and seeing if any of the high bits of each
+ * byte changed from 0 to 1. This works because the least significant
+ * 0 byte must have had no incoming carry (otherwise it's not the least
+ * significant), so it is 0x00 - 0x01 == 0xff. For all other
+ * byte values, either they have the high bit set initially, or when
+ * 1 is subtracted you get a value in the range 0x00-0x7f, none of which
+ * have their high bit set. The expression here is
+ * (x - 0x01010101) & ~x & 0x80808080), which gives 0x00000000 when
+ * there were no 0x00 bytes in the word. You get 0x80 in bytes that
+ * match, but possibly false 0x80 matches in the next more significant
+ * byte to a true match due to carries. For little-endian this is
+ * of no consequence since the least significant match is the one
+ * we're interested in, but big-endian needs method 2 to find which
+ * byte matches.
+ * 2) Given a word 'x', we can test to see _which_ byte was zero by
+ * calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080).
+ * This produces 0x80 in each byte that was zero, and 0x00 in all
+ * the other bytes. The '| ~0x80808080' clears the low 7 bits in each
+ * byte, and the '| x' part ensures that bytes with the high bit set
+ * produce 0x00. The addition will carry into the high bit of each byte
+ * iff that byte had one of its low 7 bits set. We can then just see
+ * which was the most significant bit set and divide by 8 to find how
+ * many to add to the index.
+ * This is from the book 'The PowerPC Compiler Writer's Guide',
+ * by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
+ */
+
+_GLOBAL(strlen)
+ andi. r0, r3, 3
+ lis r7, 0x0101
+ addi r10, r3, -4
+ addic r7, r7, 0x0101 /* r7 = 0x01010101 (lomagic) & clear XER[CA] */
+ rotlwi r6, r7, 31 /* r6 = 0x80808080 (himagic) */
+ bne- 3f
+ .balign IFETCH_ALIGN_BYTES
+1: lwzu r9, 4(r10)
+2: subf r8, r7, r9
+ and. r8, r8, r6
+ beq+ 1b
+ andc. r8, r8, r9
+ beq+ 1b
+ andc r8, r9, r6
+ orc r9, r9, r6
+ subfe r8, r6, r8
+ nor r8, r8, r9
+ cntlzw r8, r8
+ subf r3, r3, r10
+ srwi r8, r8, 3
+ add r3, r3, r8
+ blr
+
+ /* Missaligned string: make sure bytes before string are seen not 0 */
+3: xor r10, r10, r0
+ orc r8, r8, r8
+ lwzu r9, 4(r10)
+ slwi r0, r0, 3
+ srw r8, r8, r0
+ orc r9, r9, r8
+ b 2b
+EXPORT_SYMBOL(strlen)
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cdcaca9..9f340494a8ac 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
return 0;
}
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
{
if (in_interrupt())
return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
* passed a pointer to the destination which we return as required by a
* memcpy implementation.
*/
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
{
disable_kernel_altivec();
preempt_enable();