Merge tag 'powerpc-4.19-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc updates from Michael Ellerman: "Notable changes: - A fix for a bug in our page table fragment allocator, where a page table page could be freed and reallocated for something else while still in use, leading to memory corruption etc. The fix reuses pt_mm in struct page (x86 only) for a powerpc only refcount. - Fixes to our pkey support. Several are user-visible changes, but bring us in to line with x86 behaviour and/or fix outright bugs. Thanks to Florian Weimer for reporting many of these. - A series to improve the hvc driver & related OPAL console code, which have been seen to cause hardlockups at times. The hvc driver changes in particular have been in linux-next for ~month. - Increase our MAX_PHYSMEM_BITS to 128TB when SPARSEMEM_VMEMMAP=y. - Remove Power8 DD1 and Power9 DD1 support, neither chip should be in use anywhere other than as a paper weight. - An optimised memcmp implementation using Power7-or-later VMX instructions - Support for barrier_nospec on some NXP CPUs. - Support for flushing the count cache on context switch on some IBM CPUs (controlled by firmware), as a Spectre v2 mitigation. - A series to enhance the information we print on unhandled signals to bring it into line with other arches, including showing the offending VMA and dumping the instructions around the fault. Thanks to: Aaro Koskinen, Akshay Adiga, Alastair D'Silva, Alexey Kardashevskiy, Alexey Spirkov, Alistair Popple, Andrew Donnellan, Aneesh Kumar K.V, Anju T Sudhakar, Arnd Bergmann, Bartosz Golaszewski, Benjamin Herrenschmidt, Bharat Bhushan, Bjoern Noetel, Boqun Feng, Breno Leitao, Bryant G. Ly, Camelia Groza, Christophe Leroy, Christoph Hellwig, Cyril Bur, Dan Carpenter, Daniel Klamt, Darren Stevens, Dave Young, David Gibson, Diana Craciun, Finn Thain, Florian Weimer, Frederic Barrat, Gautham R. Shenoy, Geert Uytterhoeven, Geoff Levand, Guenter Roeck, Gustavo Romero, Haren Myneni, Hari Bathini, Joel Stanley, Jonathan Neuschäfer, Kees Cook, Madhavan Srinivasan, Mahesh Salgaonkar, Markus Elfring, Mathieu Malaterre, Mauro S. M. Rodrigues, Michael Hanselmann, Michael Neuling, Michael Schmitz, Mukesh Ojha, Murilo Opsfelder Araujo, Nicholas Piggin, Parth Y Shah, Paul Mackerras, Paul Menzel, Ram Pai, Randy Dunlap, Rashmica Gupta, Reza Arbab, Rodrigo R. Galvao, Russell Currey, Sam Bobroff, Scott Wood, Shilpasri G Bhat, Simon Guo, Souptick Joarder, Stan Johnson, Thiago Jung Bauermann, Tyrel Datwyler, Vaibhav Jain, Vasant Hegde, Venkat Rao, zhong jiang" * tag 'powerpc-4.19-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (234 commits) powerpc/mm/book3s/radix: Add mapping statistics powerpc/uaccess: Enable get_user(u64, *p) on 32-bit powerpc/mm/hash: Remove unnecessary do { } while(0) loop powerpc/64s: move machine check SLB flushing to mm/slb.c powerpc/powernv/idle: Fix build error powerpc/mm/tlbflush: update the mmu_gather page size while iterating address range powerpc/mm: remove warning about ‘type’ being set powerpc/32: Include setup.h header file to fix warnings powerpc: Move `path` variable inside DEBUG_PROM powerpc/powermac: Make some functions static powerpc/powermac: Remove variable x that's never read cxl: remove a dead branch powerpc/powermac: Add missing include of header pmac.h powerpc/kexec: Use common error handling code in setup_new_fdt() powerpc/xmon: Add address lookup for percpu symbols powerpc/mm: remove huge_pte_offset_and_shift() prototype powerpc/lib: Use patch_site to patch copy_32 functions once cache is enabled powerpc/pseries: Fix endianness while restoring of r3 in MCE handler. powerpc/fadump: merge adjacent memory ranges to reduce PT_LOAD segements powerpc/fadump: handle crash memory ranges array index overflow ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2018-08-17 11:32:50 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-08-17 11:32:50 -0700
commit: 5e2d059b52e397d9ac42f4c4d9d9a841887b5818 (patch)
tree: c8cd8fd7187113be33e29fcc75f45a8bbc27e6b2 /arch/powerpc/lib
parent: d190775206d06397a9309421cac5ba2f2c243521 (diff)
parent: a2dc009afa9ae8b92305be7728676562a104cb40 (diff)
17 files changed, 839 insertions, 375 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index d0ca13ad8231..670286808928 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
 
 obj-y += string.o alloc.o code-patching.o feature-fixups.o
 
-obj-$(CONFIG_PPC32)	+= div64.o copy_32.o crtsavres.o
+obj-$(CONFIG_PPC32)	+= div64.o copy_32.o crtsavres.o strlen_32.o
 
 # See corresponding test in arch/powerpc/Makefile
 # 64-bit linker creates .sfpr on demand for final link (vmlinux),
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index e0d881ab304e..850f3b8f4da5 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -195,6 +195,22 @@ int patch_branch(unsigned int *addr, unsigned long target, int flags)
 	return patch_instruction(addr, create_branch(addr, target, flags));
 }
 
+int patch_branch_site(s32 *site, unsigned long target, int flags)
+{
+	unsigned int *addr;
+
+	addr = (unsigned int *)((unsigned long)site + *site);
+	return patch_instruction(addr, create_branch(addr, target, flags));
+}
+
+int patch_instruction_site(s32 *site, unsigned int instr)
+{
+	unsigned int *addr;
+
+	addr = (unsigned int *)((unsigned long)site + *site);
+	return patch_instruction(addr, instr);
+}
+
 bool is_offset_in_branch_range(long offset)
 {
 	/*
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index da425bb6b369..ba66846fe973 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -13,6 +13,7 @@
 #include <asm/errno.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/code-patching-asm.h>
 
 #define COPY_16_BYTES		\
 	lwz	r7,4(r4);	\
@@ -107,8 +108,8 @@ _GLOBAL(memset)
 	 * Skip optimised bloc until cache is enabled. Will be replaced
 	 * by 'bne' during boot to use normal procedure if r4 is not zero
 	 */
-_GLOBAL(memset_nocache_branch)
-	b	2f
+5:	b	2f
+	patch_site	5b, patch__memset_nocache
 
 	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
 	add	r8,r7,r5
@@ -168,7 +169,9 @@ _GLOBAL(memmove)
 	/* fall through */
 
 _GLOBAL(memcpy)
-	b	generic_memcpy
+1:	b	generic_memcpy
+	patch_site	1b, patch__memcpy_nocache
+
 	add	r7,r3,r5		/* test if the src & dst overlap */
 	add	r8,r4,r5
 	cmplw	0,r4,r7
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 8d5034f645f3..694390357667 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -11,6 +11,7 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/export.h>
+#include <asm/feature-fixups.h>
 
         .section        ".toc","aw"
 PPC64_CACHES:
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7ab20e..e38f956f7d9f 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
 	addi	r3,r3,128
 	bdnz	1b
 
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 #else
 	li	r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 506677395681..96c514bee66b 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -9,6 +9,13 @@
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#ifndef SELFTEST_CASE
+/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
+#define SELFTEST_CASE	0
+#endif
 
 #ifdef __BIG_ENDIAN__
 #define sLd sld		/* Shift towards low-numbered address. */
@@ -18,6 +25,28 @@
 #define sHd sld		/* Shift towards high-numbered address. */
 #endif
 
+/*
+ * These macros are used to generate exception table entries.
+ * The exception handlers below use the original arguments
+ * (stored on the stack) and the point where we're up to in
+ * the destination buffer, i.e. the address of the first
+ * unmodified byte.  Generally r3 points into the destination
+ * buffer, but the first unmodified byte is at a variable
+ * offset from r3.  In the code below, the symbol r3_offset
+ * is set to indicate the current offset at each point in
+ * the code.  This offset is then used as a negative offset
+ * from the exception handler code, and those instructions
+ * before the exception handlers are addi instructions that
+ * adjust r3 to point to the correct place.
+ */
+	.macro	lex		/* exception handler for load */
+100:	EX_TABLE(100b, .Lld_exc - r3_offset)
+	.endm
+
+	.macro	stex		/* exception handler for store */
+100:	EX_TABLE(100b, .Lst_exc - r3_offset)
+	.endm
+
 	.align	7
 _GLOBAL_TOC(__copy_tofrom_user)
 #ifdef CONFIG_PPC_BOOK3S_64
@@ -28,7 +57,7 @@ FTR_SECTION_ELSE
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 #endif
 _GLOBAL(__copy_tofrom_user_base)
-	/* first check for a whole page copy on a page boundary */
+	/* first check for a 4kB copy on a 4kB boundary */
 	cmpldi	cr1,r5,16
 	cmpdi	cr6,r5,4096
 	or	r0,r3,r4
@@ -49,6 +78,7 @@ _GLOBAL(__copy_tofrom_user_base)
  * At the time of writing the only CPU that has this combination of bits
  * set is Power6.
  */
+test_feature = (SELFTEST_CASE == 1)
 BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
@@ -57,6 +87,8 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
 		    CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
 	addi	r3,r3,-16
+r3_offset = 16
+test_feature = (SELFTEST_CASE == 0)
 BEGIN_FTR_SECTION
 	andi.	r0,r4,7
 	bne	.Lsrc_unaligned
@@ -64,57 +96,69 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
 	srdi	r0,r5,5
 	cmpdi	cr1,r0,0
-20:	ld	r7,0(r4)
-220:	ld	r6,8(r4)
+lex;	ld	r7,0(r4)
+lex;	ld	r6,8(r4)
 	addi	r4,r4,16
 	mtctr	r0
 	andi.	r0,r5,0x10
 	beq	22f
 	addi	r3,r3,16
+r3_offset = 0
 	addi	r4,r4,-16
 	mr	r9,r7
 	mr	r8,r6
 	beq	cr1,72f
-21:	ld	r7,16(r4)
-221:	ld	r6,24(r4)
+21:
+lex;	ld	r7,16(r4)
+lex;	ld	r6,24(r4)
 	addi	r4,r4,32
-70:	std	r9,0(r3)
-270:	std	r8,8(r3)
-22:	ld	r9,0(r4)
-222:	ld	r8,8(r4)
-71:	std	r7,16(r3)
-271:	std	r6,24(r3)
+stex;	std	r9,0(r3)
+r3_offset = 8
+stex;	std	r8,8(r3)
+r3_offset = 16
+22:
+lex;	ld	r9,0(r4)
+lex;	ld	r8,8(r4)
+stex;	std	r7,16(r3)
+r3_offset = 24
+stex;	std	r6,24(r3)
 	addi	r3,r3,32
+r3_offset = 0
 	bdnz	21b
-72:	std	r9,0(r3)
-272:	std	r8,8(r3)
+72:
+stex;	std	r9,0(r3)
+r3_offset = 8
+stex;	std	r8,8(r3)
+r3_offset = 16
 	andi.	r5,r5,0xf
 	beq+	3f
 	addi	r4,r4,16
 .Ldo_tail:
 	addi	r3,r3,16
+r3_offset = 0
 	bf	cr7*4+0,246f
-244:	ld	r9,0(r4)
+lex;	ld	r9,0(r4)
 	addi	r4,r4,8
-245:	std	r9,0(r3)
+stex;	std	r9,0(r3)
 	addi	r3,r3,8
 246:	bf	cr7*4+1,1f
-23:	lwz	r9,0(r4)
+lex;	lwz	r9,0(r4)
 	addi	r4,r4,4
-73:	stw	r9,0(r3)
+stex;	stw	r9,0(r3)
 	addi	r3,r3,4
 1:	bf	cr7*4+2,2f
-44:	lhz	r9,0(r4)
+lex;	lhz	r9,0(r4)
 	addi	r4,r4,2
-74:	sth	r9,0(r3)
+stex;	sth	r9,0(r3)
 	addi	r3,r3,2
 2:	bf	cr7*4+3,3f
-45:	lbz	r9,0(r4)
-75:	stb	r9,0(r3)
+lex;	lbz	r9,0(r4)
+stex;	stb	r9,0(r3)
 3:	li	r3,0
 	blr
 
 .Lsrc_unaligned:
+r3_offset = 16
 	srdi	r6,r5,3
 	addi	r5,r5,-16
 	subf	r4,r0,r4
@@ -127,58 +171,69 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	add	r5,r5,r0
 	bt	cr7*4+0,28f
 
-24:	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
-25:	ld	r0,8(r4)
+lex;	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
+lex;	ld	r0,8(r4)
 	sLd	r6,r9,r10
-26:	ldu	r9,16(r4)
+lex;	ldu	r9,16(r4)
 	sHd	r7,r0,r11
 	sLd	r8,r0,r10
 	or	r7,r7,r6
 	blt	cr6,79f
-27:	ld	r0,8(r4)
+lex;	ld	r0,8(r4)
 	b	2f
 
-28:	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
-29:	ldu	r9,8(r4)
+28:
+lex;	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
+lex;	ldu	r9,8(r4)
 	sLd	r8,r0,r10
 	addi	r3,r3,-8
+r3_offset = 24
 	blt	cr6,5f
-30:	ld	r0,8(r4)
+lex;	ld	r0,8(r4)
 	sHd	r12,r9,r11
 	sLd	r6,r9,r10
-31:	ldu	r9,16(r4)
+lex;	ldu	r9,16(r4)
 	or	r12,r8,r12
 	sHd	r7,r0,r11
 	sLd	r8,r0,r10
 	addi	r3,r3,16
+r3_offset = 8
 	beq	cr6,78f
 
 1:	or	r7,r7,r6
-32:	ld	r0,8(r4)
-76:	std	r12,8(r3)
+lex;	ld	r0,8(r4)
+stex;	std	r12,8(r3)
+r3_offset = 16
 2:	sHd	r12,r9,r11
 	sLd	r6,r9,r10
-33:	ldu	r9,16(r4)
+lex;	ldu	r9,16(r4)
 	or	r12,r8,r12
-77:	stdu	r7,16(r3)
+stex;	stdu	r7,16(r3)
+r3_offset = 8
 	sHd	r7,r0,r11
 	sLd	r8,r0,r10
 	bdnz	1b
 
-78:	std	r12,8(r3)
+78:
+stex;	std	r12,8(r3)
+r3_offset = 16
 	or	r7,r7,r6
-79:	std	r7,16(r3)
+79:
+stex;	std	r7,16(r3)
+r3_offset = 24
 5:	sHd	r12,r9,r11
 	or	r12,r8,r12
-80:	std	r12,24(r3)
+stex;	std	r12,24(r3)
+r3_offset = 32
 	bne	6f
 	li	r3,0
 	blr
 6:	cmpwi	cr1,r5,8
 	addi	r3,r3,32
+r3_offset = 0
 	sLd	r9,r9,r10
 	ble	cr1,7f
-34:	ld	r0,8(r4)
+lex;	ld	r0,8(r4)
 	sHd	r7,r0,r11
 	or	r9,r7,r9
 7:
@@ -186,7 +241,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 #ifdef __BIG_ENDIAN__
 	rotldi	r9,r9,32
 #endif
-94:	stw	r9,0(r3)
+stex;	stw	r9,0(r3)
 #ifdef __LITTLE_ENDIAN__
 	rotrdi	r9,r9,32
 #endif
@@ -195,7 +250,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 #ifdef __BIG_ENDIAN__
 	rotldi	r9,r9,16
 #endif
-95:	sth	r9,0(r3)
+stex;	sth	r9,0(r3)
 #ifdef __LITTLE_ENDIAN__
 	rotrdi	r9,r9,16
 #endif
@@ -204,7 +259,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 #ifdef __BIG_ENDIAN__
 	rotldi	r9,r9,8
 #endif
-96:	stb	r9,0(r3)
+stex;	stb	r9,0(r3)
 #ifdef __LITTLE_ENDIAN__
 	rotrdi	r9,r9,8
 #endif
@@ -212,47 +267,55 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	blr
 
 .Ldst_unaligned:
+r3_offset = 0
 	PPC_MTOCRF(0x01,r6)		/* put #bytes to 8B bdry into cr7 */
 	subf	r5,r6,r5
 	li	r7,0
 	cmpldi	cr1,r5,16
 	bf	cr7*4+3,1f
-35:	lbz	r0,0(r4)
-81:	stb	r0,0(r3)
+100:	EX_TABLE(100b, .Lld_exc_r7)
+	lbz	r0,0(r4)
+100:	EX_TABLE(100b, .Lst_exc_r7)
+	stb	r0,0(r3)
 	addi	r7,r7,1
 1:	bf	cr7*4+2,2f
-36:	lhzx	r0,r7,r4
-82:	sthx	r0,r7,r3
+100:	EX_TABLE(100b, .Lld_exc_r7)
+	lhzx	r0,r7,r4
+100:	EX_TABLE(100b, .Lst_exc_r7)
+	sthx	r0,r7,r3
 	addi	r7,r7,2
 2:	bf	cr7*4+1,3f
-37:	lwzx	r0,r7,r4
-83:	stwx	r0,r7,r3
+100:	EX_TABLE(100b, .Lld_exc_r7)
+	lwzx	r0,r7,r4
+100:	EX_TABLE(100b, .Lst_exc_r7)
+	stwx	r0,r7,r3
 3:	PPC_MTOCRF(0x01,r5)
 	add	r4,r6,r4
 	add	r3,r6,r3
 	b	.Ldst_aligned
 
 .Lshort_copy:
+r3_offset = 0
 	bf	cr7*4+0,1f
-38:	lwz	r0,0(r4)
-39:	lwz	r9,4(r4)
+lex;	lwz	r0,0(r4)
+lex;	lwz	r9,4(r4)
 	addi	r4,r4,8
-84:	stw	r0,0(r3)
-85:	stw	r9,4(r3)
+stex;	stw	r0,0(r3)
+stex;	stw	r9,4(r3)
 	addi	r3,r3,8
 1:	bf	cr7*4+1,2f
-40:	lwz	r0,0(r4)
+lex;	lwz	r0,0(r4)
 	addi	r4,r4,4
-86:	stw	r0,0(r3)
+stex;	stw	r0,0(r3)
 	addi	r3,r3,4
 2:	bf	cr7*4+2,3f
-41:	lhz	r0,0(r4)
+lex;	lhz	r0,0(r4)
 	addi	r4,r4,2
-87:	sth	r0,0(r3)
+stex;	sth	r0,0(r3)
 	addi	r3,r3,2
 3:	bf	cr7*4+3,4f
-42:	lbz	r0,0(r4)
-88:	stb	r0,0(r3)
+lex;	lbz	r0,0(r4)
+stex;	stb	r0,0(r3)
 4:	li	r3,0
 	blr
 
@@ -260,48 +323,34 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
  * exception handlers follow
  * we have to return the number of bytes not copied
  * for an exception on a load, we set the rest of the destination to 0
+ * Note that the number of bytes of instructions for adjusting r3 needs
+ * to equal the amount of the adjustment, due to the trick of using
+ * .Lld_exc - r3_offset as the handler address.
  */
 
-136:
-137:
+.Lld_exc_r7:
 	add	r3,r3,r7
-	b	1f
-130:
-131:
+	b	.Lld_exc
+
+	/* adjust by 24 */
 	addi	r3,r3,8
-120:
-320:
-122:
-322:
-124:
-125:
-126:
-127:
-128:
-129:
-133:
+	nop
+	/* adjust by 16 */
 	addi	r3,r3,8
-132:
+	nop
+	/* adjust by 8 */
 	addi	r3,r3,8
-121:
-321:
-344:
-134:
-135:
-138:
-139:
-140:
-141:
-142:
-123:
-144:
-145:
+	nop
 
 /*
- * here we have had a fault on a load and r3 points to the first
- * unmodified byte of the destination
+ * Here we have had a fault on a load and r3 points to the first
+ * unmodified byte of the destination.  We use the original arguments
+ * and r3 to work out how much wasn't copied.  Since we load some
+ * distance ahead of the stores, we continue copying byte-by-byte until
+ * we hit the load fault again in order to copy as much as possible.
  */
-1:	ld	r6,-24(r1)
+.Lld_exc:
+	ld	r6,-24(r1)
 	ld	r4,-16(r1)
 	ld	r5,-8(r1)
 	subf	r6,r6,r3
@@ -312,9 +361,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
  * first see if we can copy any more bytes before hitting another exception
  */
 	mtctr	r5
+r3_offset = 0
+100:	EX_TABLE(100b, .Ldone)
 43:	lbz	r0,0(r4)
 	addi	r4,r4,1
-89:	stb	r0,0(r3)
+stex;	stb	r0,0(r3)
 	addi	r3,r3,1
 	bdnz	43b
 	li	r3,0		/* huh? all copied successfully this time? */
@@ -323,116 +374,63 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 /*
  * here we have trapped again, amount remaining is in ctr.
  */
-143:	mfctr	r3
+.Ldone:
+	mfctr	r3
 	blr
 
 /*
- * exception handlers for stores: we just need to work
- * out how many bytes weren't copied
+ * exception handlers for stores: we need to work out how many bytes
+ * weren't copied, and we may need to copy some more.
+ * Note that the number of bytes of instructions for adjusting r3 needs
+ * to equal the amount of the adjustment, due to the trick of using
+ * .Lst_exc - r3_offset as the handler address.
  */
-182:
-183:
+.Lst_exc_r7:
 	add	r3,r3,r7
-	b	1f
-371:
-180:
+	b	.Lst_exc
+
+	/* adjust by 24 */
 	addi	r3,r3,8
-171:
-177:
-179:
+	nop
+	/* adjust by 16 */
 	addi	r3,r3,8
-370:
-372:
-176:
-178:
+	nop
+	/* adjust by 8 */
 	addi	r3,r3,4
-185:
+	/* adjust by 4 */
 	addi	r3,r3,4
-170:
-172:
-345:
-173:
-174:
-175:
-181:
-184:
-186:
-187:
-188:
-189:	
-194:
-195:
-196:
-1:
-	ld	r6,-24(r1)
-	ld	r5,-8(r1)
-	add	r6,r6,r5
-	subf	r3,r3,r6	/* #bytes not copied */
+.Lst_exc:
+	ld	r6,-24(r1)	/* original destination pointer */
+	ld	r4,-16(r1)	/* original source pointer */
+	ld	r5,-8(r1)	/* original number of bytes */
+	add	r7,r6,r5
+	/*
+	 * If the destination pointer isn't 8-byte aligned,
+	 * we may have got the exception as a result of a
+	 * store that overlapped a page boundary, so we may be
+	 * able to copy a few more bytes.
+	 */
+17:	andi.	r0,r3,7
+	beq	19f
+	subf	r8,r6,r3	/* #bytes copied */
+100:	EX_TABLE(100b,19f)
+	lbzx	r0,r8,r4
+100:	EX_TABLE(100b,19f)
+	stb	r0,0(r3)
+	addi	r3,r3,1
+	cmpld	r3,r7
+	blt	17b
+19:	subf	r3,r3,r7	/* #bytes not copied in r3 */
 	blr
 
-	EX_TABLE(20b,120b)
-	EX_TABLE(220b,320b)
-	EX_TABLE(21b,121b)
-	EX_TABLE(221b,321b)
-	EX_TABLE(70b,170b)
-	EX_TABLE(270b,370b)
-	EX_TABLE(22b,122b)
-	EX_TABLE(222b,322b)
-	EX_TABLE(71b,171b)
-	EX_TABLE(271b,371b)
-	EX_TABLE(72b,172b)
-	EX_TABLE(272b,372b)
-	EX_TABLE(244b,344b)
-	EX_TABLE(245b,345b)
-	EX_TABLE(23b,123b)
-	EX_TABLE(73b,173b)
-	EX_TABLE(44b,144b)
-	EX_TABLE(74b,174b)
-	EX_TABLE(45b,145b)
-	EX_TABLE(75b,175b)
-	EX_TABLE(24b,124b)
-	EX_TABLE(25b,125b)
-	EX_TABLE(26b,126b)
-	EX_TABLE(27b,127b)
-	EX_TABLE(28b,128b)
-	EX_TABLE(29b,129b)
-	EX_TABLE(30b,130b)
-	EX_TABLE(31b,131b)
-	EX_TABLE(32b,132b)
-	EX_TABLE(76b,176b)
-	EX_TABLE(33b,133b)
-	EX_TABLE(77b,177b)
-	EX_TABLE(78b,178b)
-	EX_TABLE(79b,179b)
-	EX_TABLE(80b,180b)
-	EX_TABLE(34b,134b)
-	EX_TABLE(94b,194b)
-	EX_TABLE(95b,195b)
-	EX_TABLE(96b,196b)
-	EX_TABLE(35b,135b)
-	EX_TABLE(81b,181b)
-	EX_TABLE(36b,136b)
-	EX_TABLE(82b,182b)
-	EX_TABLE(37b,137b)
-	EX_TABLE(83b,183b)
-	EX_TABLE(38b,138b)
-	EX_TABLE(39b,139b)
-	EX_TABLE(84b,184b)
-	EX_TABLE(85b,185b)
-	EX_TABLE(40b,140b)
-	EX_TABLE(86b,186b)
-	EX_TABLE(41b,141b)
-	EX_TABLE(87b,187b)
-	EX_TABLE(42b,142b)
-	EX_TABLE(88b,188b)
-	EX_TABLE(43b,143b)
-	EX_TABLE(89b,189b)
-
 /*
  * Routine to copy a whole page of data, optimized for POWER4.
  * On POWER4 it is more than 50% faster than the simple loop
  * above (following the .Ldst_aligned label).
  */
+	.macro	exc
+100:	EX_TABLE(100b, .Labort)
+	.endm
 .Lcopy_page_4K:
 	std	r31,-32(1)
 	std	r30,-40(1)
@@ -451,86 +449,86 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	li	r0,5
 0:	addi	r5,r5,-24
 	mtctr	r0
-20:	ld	r22,640(4)
-21:	ld	r21,512(4)
-22:	ld	r20,384(4)
-23:	ld	r11,256(4)
-24:	ld	r9,128(4)
-25:	ld	r7,0(4)
-26:	ld	r25,648(4)
-27:	ld	r24,520(4)
-28:	ld	r23,392(4)
-29:	ld	r10,264(4)
-30:	ld	r8,136(4)
-31:	ldu	r6,8(4)
+exc;	ld	r22,640(4)
+exc;	ld	r21,512(4)
+exc;	ld	r20,384(4)
+exc;	ld	r11,256(4)
+exc;	ld	r9,128(4)
+exc;	ld	r7,0(4)
+exc;	ld	r25,648(4)
+exc;	ld	r24,520(4)
+exc;	ld	r23,392(4)
+exc;	ld	r10,264(4)
+exc;	ld	r8,136(4)
+exc;	ldu	r6,8(4)
 	cmpwi	r5,24
 1:
-32:	std	r22,648(3)
-33:	std	r21,520(3)
-34:	std	r20,392(3)
-35:	std	r11,264(3)
-36:	std	r9,136(3)
-37:	std	r7,8(3)
-38:	ld	r28,648(4)
-39:	ld	r27,520(4)
-40:	ld	r26,392(4)
-41:	ld	r31,264(4)
-42:	ld	r30,136(4)
-43:	ld	r29,8(4)
-44:	std	r25,656(3)
-45:	std	r24,528(3)
-46:	std	r23,400(3)
-47:	std	r10,272(3)
-48:	std	r8,144(3)
-49:	std	r6,16(3)
-50:	ld	r22,656(4)
-51:	ld	r21,528(4)
-52:	ld	r20,400(4)
-53:	ld	r11,272(4)
-54:	ld	r9,144(4)
-55:	ld	r7,16(4)
-56:	std	r28,664(3)
-57:	std	r27,536(3)
-58:	std	r26,408(3)
-59:	std	r31,280(3)
-60:	std	r30,152(3)
-61:	stdu	r29,24(3)
-62:	ld	r25,664(4)
-63:	ld	r24,536(4)
-64:	ld	r23,408(4)
-65:	ld	r10,280(4)
-66:	ld	r8,152(4)
-67:	ldu	r6,24(4)
+exc;	std	r22,648(3)
+exc;	std	r21,520(3)
+exc;	std	r20,392(3)
+exc;	std	r11,264(3)
+exc;	std	r9,136(3)
+exc;	std	r7,8(3)
+exc;	ld	r28,648(4)
+exc;	ld	r27,520(4)
+exc;	ld	r26,392(4)
+exc;	ld	r31,264(4)
+exc;	ld	r30,136(4)
+exc;	ld	r29,8(4)
+exc;	std	r25,656(3)
+exc;	std	r24,528(3)
+exc;	std	r23,400(3)
+exc;	std	r10,272(3)
+exc;	std	r8,144(3)
+exc;	std	r6,16(3)
+exc;	ld	r22,656(4)
+exc;	ld	r21,528(4)
+exc;	ld	r20,400(4)
+exc;	ld	r11,272(4)
+exc;	ld	r9,144(4)
+exc;	ld	r7,16(4)
+exc;	std	r28,664(3)
+exc;	std	r27,536(3)
+exc;	std	r26,408(3)
+exc;	std	r31,280(3)
+exc;	std	r30,152(3)
+exc;	stdu	r29,24(3)
+exc;	ld	r25,664(4)
+exc;	ld	r24,536(4)
+exc;	ld	r23,408(4)
+exc;	ld	r10,280(4)
+exc;	ld	r8,152(4)
+exc;	ldu	r6,24(4)
 	bdnz	1b
-68:	std	r22,648(3)
-69:	std	r21,520(3)
-70:	std	r20,392(3)
-71:	std	r11,264(3)
-72:	std	r9,136(3)
-73:	std	r7,8(3)
-74:	addi	r4,r4,640
-75:	addi	r3,r3,648
+exc;	std	r22,648(3)
+exc;	std	r21,520(3)
+exc;	std	r20,392(3)
+exc;	std	r11,264(3)
+exc;	std	r9,136(3)
+exc;	std	r7,8(3)
+	addi	r4,r4,640
+	addi	r3,r3,648
 	bge	0b
 	mtctr	r5
-76:	ld	r7,0(4)
-77:	ld	r8,8(4)
-78:	ldu	r9,16(4)
+exc;	ld	r7,0(4)
+exc;	ld	r8,8(4)
+exc;	ldu	r9,16(4)
 3:
-79:	ld	r10,8(4)
-80:	std	r7,8(3)
-81:	ld	r7,16(4)
-82:	std	r8,16(3)
-83:	ld	r8,24(4)
-84:	std	r9,24(3)
-85:	ldu	r9,32(4)
-86:	stdu	r10,32(3)
+exc;	ld	r10,8(4)
+exc;	std	r7,8(3)
+exc;	ld	r7,16(4)
+exc;	std	r8,16(3)
+exc;	ld	r8,24(4)
+exc;	std	r9,24(3)
+exc;	ldu	r9,32(4)
+exc;	stdu	r10,32(3)
 	bdnz	3b
 4:
-87:	ld	r10,8(4)
-88:	std	r7,8(3)
-89:	std	r8,16(3)
-90:	std	r9,24(3)
-91:	std	r10,32(3)
+exc;	ld	r10,8(4)
+exc;	std	r7,8(3)
+exc;	std	r8,16(3)
+exc;	std	r9,24(3)
+exc;	std	r10,32(3)
 9:	ld	r20,-120(1)
 	ld	r21,-112(1)
 	ld	r22,-104(1)
@@ -550,7 +548,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
  * on an exception, reset to the beginning and jump back into the
  * standard __copy_tofrom_user
  */
-100:	ld	r20,-120(1)
+.Labort:
+	ld	r20,-120(1)
 	ld	r21,-112(1)
 	ld	r22,-104(1)
 	ld	r23,-96(1)
@@ -566,78 +565,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	ld	r4,-16(r1)
 	li	r5,4096
 	b	.Ldst_aligned
-
-	EX_TABLE(20b,100b)
-	EX_TABLE(21b,100b)
-	EX_TABLE(22b,100b)
-	EX_TABLE(23b,100b)
-	EX_TABLE(24b,100b)
-	EX_TABLE(25b,100b)
-	EX_TABLE(26b,100b)
-	EX_TABLE(27b,100b)
-	EX_TABLE(28b,100b)
-	EX_TABLE(29b,100b)
-	EX_TABLE(30b,100b)
-	EX_TABLE(31b,100b)
-	EX_TABLE(32b,100b)
-	EX_TABLE(33b,100b)
-	EX_TABLE(34b,100b)
-	EX_TABLE(35b,100b)
-	EX_TABLE(36b,100b)
-	EX_TABLE(37b,100b)
-	EX_TABLE(38b,100b)
-	EX_TABLE(39b,100b)
-	EX_TABLE(40b,100b)
-	EX_TABLE(41b,100b)
-	EX_TABLE(42b,100b)
-	EX_TABLE(43b,100b)
-	EX_TABLE(44b,100b)
-	EX_TABLE(45b,100b)
-	EX_TABLE(46b,100b)
-	EX_TABLE(47b,100b)
-	EX_TABLE(48b,100b)
-	EX_TABLE(49b,100b)
-	EX_TABLE(50b,100b)
-	EX_TABLE(51b,100b)
-	EX_TABLE(52b,100b)
-	EX_TABLE(53b,100b)
-	EX_TABLE(54b,100b)
-	EX_TABLE(55b,100b)
-	EX_TABLE(56b,100b)
-	EX_TABLE(57b,100b)
-	EX_TABLE(58b,100b)
-	EX_TABLE(59b,100b)
-	EX_TABLE(60b,100b)
-	EX_TABLE(61b,100b)
-	EX_TABLE(62b,100b)
-	EX_TABLE(63b,100b)
-	EX_TABLE(64b,100b)
-	EX_TABLE(65b,100b)
-	EX_TABLE(66b,100b)
-	EX_TABLE(67b,100b)
-	EX_TABLE(68b,100b)
-	EX_TABLE(69b,100b)
-	EX_TABLE(70b,100b)
-	EX_TABLE(71b,100b)
-	EX_TABLE(72b,100b)
-	EX_TABLE(73b,100b)
-	EX_TABLE(74b,100b)
-	EX_TABLE(75b,100b)
-	EX_TABLE(76b,100b)
-	EX_TABLE(77b,100b)
-	EX_TABLE(78b,100b)
-	EX_TABLE(79b,100b)
-	EX_TABLE(80b,100b)
-	EX_TABLE(81b,100b)
-	EX_TABLE(82b,100b)
-	EX_TABLE(83b,100b)
-	EX_TABLE(84b,100b)
-	EX_TABLE(85b,100b)
-	EX_TABLE(86b,100b)
-	EX_TABLE(87b,100b)
-	EX_TABLE(88b,100b)
-	EX_TABLE(89b,100b)
-	EX_TABLE(90b,100b)
-	EX_TABLE(91b,100b)
-
 EXPORT_SYMBOL(__copy_tofrom_user)
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index 215e4760c09f..1a1fe180af62 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -19,6 +19,11 @@
  */
 #include <asm/ppc_asm.h>
 
+#ifndef SELFTEST_CASE
+/* 0 == don't use VMX, 1 == use VMX */
+#define SELFTEST_CASE	0
+#endif
+
 #ifdef __BIG_ENDIAN__
 #define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
 #define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
@@ -80,7 +85,6 @@
 
 
 _GLOBAL(__copy_tofrom_user_power7)
-#ifdef CONFIG_ALTIVEC
 	cmpldi	r5,16
 	cmpldi	cr1,r5,3328
 
@@ -89,15 +93,12 @@ _GLOBAL(__copy_tofrom_user_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 
 	blt	.Lshort_copy
-	bge	cr1,.Lvmx_copy
-#else
-	cmpldi	r5,16
 
-	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
-	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
-
-	blt	.Lshort_copy
+#ifdef CONFIG_ALTIVEC
+test_feature = SELFTEST_CASE
+BEGIN_FTR_SECTION
+	bgt	cr1,.Lvmx_copy
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
 
 .Lnonvmx_copy:
@@ -278,8 +279,8 @@ err1;	stb	r0,0(r3)
 	addi	r1,r1,STACKFRAMESIZE
 	b	.Lnonvmx_copy
 
-#ifdef CONFIG_ALTIVEC
 .Lvmx_copy:
+#ifdef CONFIG_ALTIVEC
 	mflr	r0
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S
index f16cec989506..ee7c5fd5fc64 100644
--- a/arch/powerpc/lib/feature-fixups-test.S
+++ b/arch/powerpc/lib/feature-fixups-test.S
@@ -11,6 +11,7 @@
 #include <asm/feature-fixups.h>
 #include <asm/ppc_asm.h>
 #include <asm/synch.h>
+#include <asm/asm-compat.h>
 
 	.text
 
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 8b69f868298c..e613b02bb2f0 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -304,6 +304,9 @@ void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_
 	printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
 }
 
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BARRIER_NOSPEC
 void do_barrier_nospec_fixups(bool enable)
 {
 	void *start, *end;
@@ -313,8 +316,38 @@ void do_barrier_nospec_fixups(bool enable)
 
 	do_barrier_nospec_fixups_range(enable, start, end);
 }
+#endif /* CONFIG_PPC_BARRIER_NOSPEC */
 
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end)
+{
+	unsigned int instr[2], *dest;
+	long *start, *end;
+	int i;
+
+	start = fixup_start;
+	end = fixup_end;
+
+	instr[0] = PPC_INST_NOP;
+	instr[1] = PPC_INST_NOP;
+
+	if (enable) {
+		pr_info("barrier-nospec: using isync; sync as speculation barrier\n");
+		instr[0] = PPC_INST_ISYNC;
+		instr[1] = PPC_INST_SYNC;
+	}
+
+	for (i = 0; start < end; start++, i++) {
+		dest = (void *)start + *start;
+
+		pr_devel("patching dest %lx\n", (unsigned long)dest);
+		patch_instruction(dest, instr[0]);
+		patch_instruction(dest + 1, instr[1]);
+	}
+
+	printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_FSL_BOOK3E */
 
 void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 {
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
index 3de7ac154f24..0526b2225260 100644
--- a/arch/powerpc/lib/hweight_64.S
+++ b/arch/powerpc/lib/hweight_64.S
@@ -20,6 +20,7 @@
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/feature-fixups.h>
 
 /* Note: This code relies on -mminimal-toc */
 
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index ae15eba49c1f..32e91994b6b2 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -15,6 +15,7 @@
 #include <asm/ppc-opcode.h>
 #include <asm/reg.h>
 #include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
 #include <linux/errno.h>
 
 #ifdef CONFIG_PPC_FPU
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index b7b1237d4aa6..35a0ef932e1a 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -15,7 +15,6 @@
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
 #include <linux/export.h>
-#include <linux/stringify.h>
 #include <linux/smp.h>
 
 /* waiting for a spinlock... */
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b7bd55..844d8e774492 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
  */
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/ppc-opcode.h>
 
 #define off8	r6
 #define off16	r7
@@ -24,28 +25,102 @@
 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
-_GLOBAL(memcmp)
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS	\
+	mflr    r0;	\
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      enter_vmx_ops; \
+	cmpwi   cr1,r3,0; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+#define EXIT_VMX_OPS \
+	mflr    r0; \
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      exit_vmx_ops; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ *    ^                                  ^                                 ^
+ * 0xbbbb10                          0xbbbb20                          0xbbb30
+ *                                 ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+        lvx     _v2nd_qw,_vaddr,off16; \
+        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_xxxx
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_xxxx
+ */
+_GLOBAL_TOC(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses are not
+	 * with the same offset of 8 bytes align boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* Fall back to short loop if compare at aligned addrs
+	 * with less than 8 bytes.
+	 */
+	cmpdi   cr6,r5,7
 
 	beq	cr1,.Lzero
-	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.Lno_short
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,11 +153,98 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.Lno_short:
+	dcbt	0,r3
+	dcbt	0,r4
+	bne	.Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+	/* attempt to compare bytes not aligned with 8 bytes so that
+	 * rest comparison can run based on 8 bytes alignment.
+	 */
+	andi.   r6,r3,7
+
+	/* Try to compare the first double word which is not 8 bytes aligned:
+	 * load the first double word at (src & ~7UL) and shift left appropriate
+	 * bits before comparision.
+	 */
+	rlwinm  r6,r3,3,26,28
+	beq     .Lsameoffset_8bytes_aligned
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+	subfic  r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	addi	r4,r4,8
+	beq	.Lzero
+
+.Lsameoffset_8bytes_aligned:
+	/* now we are aligned with 8 bytes.
+	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+	 */
+	cmpdi   cr6,r5,31
+	bgt	cr6,.Llong
+
+.Lcmp_lt32bytes:
+	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
+	cmpdi   cr5,r5,7
+	srdi    r0,r5,3
+	ble	cr5,.Lcmp_rest_lt8bytes
+
+	/* handle 8 ~ 31 bytes */
+	clrldi  r5,r5,61
+	mtctr   r0
+2:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	bdnz	2b
+
+	cmpwi   r5,0
+	beq	.Lzero
+
+.Lcmp_rest_lt8bytes:
+	/* Here we have only less than 8 bytes to compare with. at least s1
+	 * Address is aligned with 8 bytes.
+	 * The next double words are load and shift right with appropriate
+	 * bits.
+	 */
+	subfic  r6,r5,8
+	slwi	r6,r6,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* Try to use vmx loop if length is equal or greater than 4K */
+	cmpldi  cr6,r5,VMX_THRESH
+	bge	cr6,.Lsameoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Llong_novmx_cmp:
+#endif
+	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -232,4 +394,240 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
+
+.LcmpAB_lightweight:   /* skip NV GPRS restore */
+	li	r3,1
+	bgtlr
+	li	r3,-1
+	blr
+
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+	/* Enter with src/dst addrs has the same offset with 8 bytes
+	 * align boundary.
+	 *
+	 * There is an optimization based on following fact: memcmp()
+	 * prones to fail early at the first 32 bytes.
+	 * Before applying VMX instructions which will lead to 32x128bits
+	 * VMX regs load/restore penalty, we compare the first 32 bytes
+	 * so that we can catch the ~80% fail cases.
+	 */
+
+	li	r0,4
+	mtctr	r0
+.Lsameoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Lsameoffset_prechk_32B_loop
+
+	ENTER_VMX_OPS
+	beq     cr1,.Llong_novmx_cmp
+
+3:
+	/* need to check whether r4 has the same offset with r3
+	 * for 16 bytes boundary.
+	 */
+	xor	r0,r3,r4
+	andi.	r0,r0,0xf
+	bne	.Ldiffoffset_vmx_cmp_start
+
+	/* len is no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	LD	rA,0,r3
+	beq	4f
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	addi	r5,r5,-8
+
+	beq	cr0,4f
+	/* save and restore cr0 */
+	mfocrf  r5,128
+	EXIT_VMX_OPS
+	mtocrf  128,r5
+	b	.LcmpAB_lightweight
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	clrldi  r5,r5,59
+	li	off16,16
+
+.balign 16
+5:
+	lvx 	v0,0,r3
+	lvx 	v1,0,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl	cr6,7f
+	lvx 	v0,off16,r3
+	lvx 	v1,off16,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl	cr6,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	EXIT_VMX_OPS
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	/* diff the last 16 bytes */
+	EXIT_VMX_OPS
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	li	off8,8
+	bne	cr0,.LcmpAB_lightweight
+
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
+
+.Ldiffoffset_8bytes_make_align_start:
+	/* now try to align s1 with 8 bytes */
+	rlwinm  r6,r3,3,26,28
+	beq     .Ldiffoffset_align_s1_8bytes
+
+	clrrdi	r3,r3,3
+	LD	rA,0,r3
+	LD	rB,0,r4  /* unaligned load */
+	sld	rA,rA,r6
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+
+	subfic  r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	add	r4,r4,r6
+
+	beq	.Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+	/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* only do vmx ops when the size equal or greater than 4K bytes */
+	cmpdi	cr5,r5,VMX_THRESH
+	bge	cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
+
+	cmpdi   cr5,r5,31
+	ble	cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+	b	.Llong_novmx_cmp
+#else
+	b	.Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+	/* perform a 32 bytes pre-checking before
+	 * enable VMX operations.
+	 */
+	li	r0,4
+	mtctr	r0
+.Ldiffoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Ldiffoffset_prechk_32B_loop
+
+	ENTER_VMX_OPS
+	beq     cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+	/* Firstly try to align r3 with 16 bytes */
+	andi.   r6,r3,0xf
+	li	off16,16
+	beq     .Ldiffoffset_vmx_s1_16bytes_align
+
+	LVS	v3,0,r3
+	LVS	v4,0,r4
+
+	lvx     v5,0,r3
+	lvx     v6,0,r4
+	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+	VCMPEQUB_RC(v7,v9,v10)
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	subfic  r6,r6,16
+	subf    r5,r6,r5
+	add     r3,r3,r6
+	add     r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+	/* now s1 is aligned with 16 bytes */
+	lvx     v6,0,r4
+	LVS	v4,0,r4
+	srdi	r6,r5,5  /* loop for 32 bytes each */
+	clrldi  r5,r5,59
+	mtctr	r6
+
+.balign	16
+.Ldiffoffset_vmx_32bytesloop:
+	/* the first qw of r4 was saved in v6 */
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	bdnz	.Ldiffoffset_vmx_32bytesloop
+
+	EXIT_VMX_OPS
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+	EXIT_VMX_OPS
+	/* anyway, the diff will appear in next 16 bytes */
+	li	r5,16
+	b	.Lcmp_lt32bytes
+
+#endif
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 8d8265be1a59..273ea67e60a1 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -9,6 +9,13 @@
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#ifndef SELFTEST_CASE
+/* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */
+#define SELFTEST_CASE	0
+#endif
 
 	.align	7
 _GLOBAL_TOC(memcpy)
@@ -20,10 +27,8 @@ BEGIN_FTR_SECTION
 #endif
 FTR_SECTION_ELSE
 #ifdef CONFIG_PPC_BOOK3S_64
-#ifndef SELFTEST
 	b	memcpy_power7
 #endif
-#endif
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 #ifdef __LITTLE_ENDIAN__
 	/* dumb little-endian memcpy that will get replaced at runtime */
@@ -47,6 +52,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
    cleared.
    At the time of writing the only CPU that has this combination of bits
    set is Power6. */
+test_feature = (SELFTEST_CASE == 1)
 BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
@@ -55,6 +61,7 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
                     CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
 	addi	r3,r3,-16
+test_feature = (SELFTEST_CASE == 0)
 BEGIN_FTR_SECTION
 	andi.	r0,r4,7
 	bne	.Lsrc_unaligned
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d3da08..89bfefcf7fcc 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -19,7 +19,10 @@
  */
 #include <asm/ppc_asm.h>
 
-_GLOBAL(memcpy_power7)
+#ifndef SELFTEST_CASE
+/* 0 == don't use VMX, 1 == use VMX */
+#define SELFTEST_CASE	0
+#endif
 
 #ifdef __BIG_ENDIAN__
 #define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
@@ -29,20 +32,17 @@ _GLOBAL(memcpy_power7)
 #define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
 #endif
 
-#ifdef CONFIG_ALTIVEC
+_GLOBAL(memcpy_power7)
 	cmpldi	r5,16
 	cmpldi	cr1,r5,4096
-
 	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-
 	blt	.Lshort_copy
-	bgt	cr1,.Lvmx_copy
-#else
-	cmpldi	r5,16
-
-	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
 
-	blt	.Lshort_copy
+#ifdef CONFIG_ALTIVEC
+test_feature = SELFTEST_CASE
+BEGIN_FTR_SECTION
+	bgt	cr1, .Lvmx_copy
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
 
 .Lnonvmx_copy:
@@ -223,14 +223,14 @@ _GLOBAL(memcpy_power7)
 	addi	r1,r1,STACKFRAMESIZE
 	b	.Lnonvmx_copy
 
-#ifdef CONFIG_ALTIVEC
 .Lvmx_copy:
+#ifdef CONFIG_ALTIVEC
 	mflr	r0
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S
new file mode 100644
index 000000000000..0a8d3f64d493
--- /dev/null
+++ b/arch/powerpc/lib/strlen_32.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * strlen() for PPC32
+ *
+ * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information.
+ *
+ * Inspired from glibc implementation
+ */
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+#include <asm/cache.h>
+
+	.text
+
+/*
+ * Algorithm:
+ *
+ * 1) Given a word 'x', we can test to see if it contains any 0 bytes
+ *    by subtracting 0x01010101, and seeing if any of the high bits of each
+ *    byte changed from 0 to 1. This works because the least significant
+ *    0 byte must have had no incoming carry (otherwise it's not the least
+ *    significant), so it is 0x00 - 0x01 == 0xff. For all other
+ *    byte values, either they have the high bit set initially, or when
+ *    1 is subtracted you get a value in the range 0x00-0x7f, none of which
+ *    have their high bit set. The expression here is
+ *    (x - 0x01010101) & ~x & 0x80808080), which gives 0x00000000 when
+ *    there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+ *    match, but possibly false 0x80 matches in the next more significant
+ *    byte to a true match due to carries.  For little-endian this is
+ *    of no consequence since the least significant match is the one
+ *    we're interested in, but big-endian needs method 2 to find which
+ *    byte matches.
+ * 2) Given a word 'x', we can test to see _which_ byte was zero by
+ *    calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080).
+ *    This produces 0x80 in each byte that was zero, and 0x00 in all
+ *    the other bytes. The '| ~0x80808080' clears the low 7 bits in each
+ *    byte, and the '| x' part ensures that bytes with the high bit set
+ *    produce 0x00. The addition will carry into the high bit of each byte
+ *    iff that byte had one of its low 7 bits set. We can then just see
+ *    which was the most significant bit set and divide by 8 to find how
+ *    many to add to the index.
+ *    This is from the book 'The PowerPC Compiler Writer's Guide',
+ *    by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
+ */
+
+_GLOBAL(strlen)
+	andi.   r0, r3, 3
+	lis	r7, 0x0101
+	addi	r10, r3, -4
+	addic	r7, r7, 0x0101	/* r7 = 0x01010101 (lomagic) & clear XER[CA] */
+	rotlwi	r6, r7, 31 	/* r6 = 0x80808080 (himagic) */
+	bne-	3f
+	.balign IFETCH_ALIGN_BYTES
+1:	lwzu	r9, 4(r10)
+2:	subf	r8, r7, r9
+	and.	r8, r8, r6
+	beq+	1b
+	andc.	r8, r8, r9
+	beq+	1b
+	andc	r8, r9, r6
+	orc	r9, r9, r6
+	subfe	r8, r6, r8
+	nor	r8, r8, r9
+	cntlzw	r8, r8
+	subf	r3, r3, r10
+	srwi	r8, r8, 3
+	add	r3, r3, r8
+	blr
+
+	/* Missaligned string: make sure bytes before string are seen not 0 */
+3:	xor	r10, r10, r0
+	orc	r8, r8, r8
+	lwzu	r9, 4(r10)
+	slwi	r0, r0, 3
+	srw	r8, r8, r0
+	orc	r9, r9, r8
+	b	2b
+EXPORT_SYMBOL(strlen)
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cdcaca9..9f340494a8ac 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
  * passed a pointer to the destination which we return as required by a
  * memcpy implementation.
  */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
 {
 	disable_kernel_altivec();
 	preempt_enable();
author	Linus Torvalds <torvalds@linux-foundation.org>	2018-08-17 11:32:50 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-08-17 11:32:50 -0700
commit	5e2d059b52e397d9ac42f4c4d9d9a841887b5818 (patch)
tree	c8cd8fd7187113be33e29fcc75f45a8bbc27e6b2 /arch/powerpc/lib
parent	d190775206d06397a9309421cac5ba2f2c243521 (diff)
parent	a2dc009afa9ae8b92305be7728676562a104cb40 (diff)