diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-07 13:55:45 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-07 13:55:45 -0700 |
commit | d691b7e7d1b5186eae62fd32adee65d3316bfdf6 (patch) | |
tree | 3808f7deab74f68267b9fdd6a35dcda9e50142aa | |
parent | b59eea554f57befa2aa3172fcb63e521bdd850dd (diff) | |
parent | 1e0fc9d1eb2b0241a03e0a02bcdb9b5b641b9d35 (diff) |
Merge tag 'powerpc-4.13-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman:
"Highlights include:
- Support for STRICT_KERNEL_RWX on 64-bit server CPUs.
- Platform support for FSP2 (476fpe) board.
- Enable ZONE_DEVICE on 64-bit server CPUs.
- Generic & powerpc spin loop primitives to optimise busy waiting.
- Convert VDSO update function to use new update_vsyscall() interface.
- Optimisations to hypercall/syscall/context-switch paths.
- Improvements to the CPU idle code on Power8 and Power9.
As well as many other fixes and improvements.
Thanks to: Akshay Adiga, Andrew Donnellan, Andrew Jeffery, Anshuman
Khandual, Anton Blanchard, Balbir Singh, Benjamin Herrenschmidt,
Christophe Leroy, Christophe Lombard, Colin Ian King, Dan Carpenter,
Gautham R. Shenoy, Hari Bathini, Ian Munsie, Ivan Mikhaylov, Javier
Martinez Canillas, Madhavan Srinivasan, Masahiro Yamada, Matt Brown,
Michael Neuling, Michal Suchanek, Murilo Opsfelder Araujo, Naveen N.
Rao, Nicholas Piggin, Oliver O'Halloran, Paul Mackerras, Pavel Machek,
Russell Currey, Santosh Sivaraj, Stephen Rothwell, Thiago Jung
Bauermann, Yang Li"
* tag 'powerpc-4.13-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (158 commits)
powerpc/Kconfig: Enable STRICT_KERNEL_RWX for some configs
powerpc/mm/radix: Implement STRICT_RWX/mark_rodata_ro() for Radix
powerpc/mm/hash: Implement mark_rodata_ro() for hash
powerpc/vmlinux.lds: Align __init_begin to 16M
powerpc/lib/code-patching: Use alternate map for patch_instruction()
powerpc/xmon: Add patch_instruction() support for xmon
powerpc/kprobes/optprobes: Use patch_instruction()
powerpc/kprobes: Move kprobes over to patch_instruction()
powerpc/mm/radix: Fix execute permissions for interrupt_vectors
powerpc/pseries: Fix passing of pp0 in updatepp() and updateboltedpp()
powerpc/64s: Blacklist rtas entry/exit from kprobes
powerpc/64s: Blacklist functions invoked on a trap
powerpc/64s: Un-blacklist system_call() from kprobes
powerpc/64s: Move system_call() symbol to just after setting MSR_EE
powerpc/64s: Blacklist system_call() and system_call_common() from kprobes
powerpc/64s: Convert .L__replay_interrupt_return to a local label
powerpc64/elfv1: Only dereference function descriptor for non-text symbols
cxl: Export library to support IBM XSL
powerpc/dts: Use #include "..." to include local DT
powerpc/perf/hv-24x7: Aggregate result elements on POWER9 SMT8
...
161 files changed, 4328 insertions, 1082 deletions
diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 9cabaf8a207e..bdd344aa18d9 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -61,8 +61,8 @@ as follows: boot successfully. For syntax of crashkernel= parameter, refer to Documentation/kdump/kdump.txt. If any offset is provided in crashkernel= parameter, it will be ignored - as fadump reserves memory at end of RAM for boot memory - dump preservation in case of a crash. + as fadump uses a predefined offset to reserve memory + for boot memory dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the firmware will reset PCI and other hardware state. It will diff --git a/MAINTAINERS b/MAINTAINERS index a9795896323e..a4f37b69a66c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3781,8 +3781,8 @@ S: Supported F: drivers/net/ethernet/chelsio/cxgb4vf/ CXL (IBM Coherent Accelerator Processor Interface CAPI) DRIVER -M: Ian Munsie <imunsie@au1.ibm.com> M: Frederic Barrat <fbarrat@linux.vnet.ibm.com> +M: Andrew Donnellan <andrew.donnellan@au1.ibm.com> L: linuxppc-dev@lists.ozlabs.org S: Supported F: arch/powerpc/platforms/powernv/pci-cxl.c @@ -5352,7 +5352,7 @@ S: Maintained F: drivers/video/fbdev/fsl-diu-fb.* FREESCALE DMA DRIVER -M: Li Yang <leoli@freescale.com> +M: Li Yang <leoyang.li@nxp.com> M: Zhang Wei <zw@zh-kernel.org> L: linuxppc-dev@lists.ozlabs.org S: Maintained @@ -5417,11 +5417,11 @@ S: Maintained F: drivers/net/ethernet/freescale/dpaa FREESCALE SOC DRIVERS -M: Scott Wood <oss@buserror.net> +M: Li Yang <leoyang.li@nxp.com> L: linuxppc-dev@lists.ozlabs.org L: linux-arm-kernel@lists.infradead.org S: Maintained -F: Documentation/devicetree/bindings/powerpc/fsl/ +F: Documentation/devicetree/bindings/soc/fsl/ F: drivers/soc/fsl/ F: include/linux/fsl/ @@ -5434,14 +5434,14 @@ F: include/soc/fsl/*qe*.h F: include/soc/fsl/*ucc*.h FREESCALE USB 
PERIPHERAL DRIVERS -M: Li Yang <leoli@freescale.com> +M: Li Yang <leoyang.li@nxp.com> L: linux-usb@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org S: Maintained F: drivers/usb/gadget/udc/fsl* FREESCALE QUICC ENGINE UCC ETHERNET DRIVER -M: Li Yang <leoli@freescale.com> +M: Li Yang <leoyang.li@nxp.com> L: netdev@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org S: Maintained @@ -7784,6 +7784,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/scottwood/linux.git S: Maintained F: arch/powerpc/platforms/83xx/ F: arch/powerpc/platforms/85xx/ +F: Documentation/devicetree/bindings/powerpc/fsl/ LINUX FOR POWERPC PA SEMI PWRFICIENT L: linuxppc-dev@lists.ozlabs.org diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6189238e69f8..afb608413314 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -109,14 +109,6 @@ config GENERIC_LOCKBREAK default y depends on SMP && PREEMPT -config ARCH_HAS_ILOG2_U32 - bool - default y - -config ARCH_HAS_ILOG2_U64 - bool - default y if 64BIT - config GENERIC_HWEIGHT bool default y @@ -138,6 +130,7 @@ config PPC select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@ -163,7 +156,7 @@ config PPC select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_TIME_VSYSCALL_OLD + select GENERIC_TIME_VSYSCALL select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB @@ -171,6 +164,8 @@ config PPC select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK + select ARCH_HAS_STRICT_KERNEL_RWX if (PPC_BOOK3S_64 && !RELOCATABLE && !HIBERNATION) + select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select HAVE_CBPF_JIT if !PPC64 select HAVE_CONTEXT_TRACKING if PPC64 select 
HAVE_DEBUG_KMEMLEAK @@ -208,6 +203,7 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING + select HAVE_IRQ_TIME_ACCOUNTING select IRQ_DOMAIN select IRQ_FORCED_THREADING select MODULES_USE_ELF_RELA @@ -438,6 +434,17 @@ config PPC_TRANSACTIONAL_MEM ---help--- Support user-mode Transactional Memory on POWERPC. +config LD_HEAD_STUB_CATCH + bool "Reserve 256 bytes to cope with linker stubs in HEAD text" if EXPERT + depends on PPC64 + default n + help + Very large kernels can cause linker branch stubs to be generated by + code in head_64.S, which moves the head text sections out of their + specified location. This option can work around the problem. + + If unsure, say "N". + config DISABLE_MPROFILE_KERNEL bool "Disable use of mprofile-kernel for kernel tracing" depends on PPC64 && CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 3e0f0e1fadef..8d4ed73d5490 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -98,6 +98,7 @@ endif LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) +LDFLAGS_vmlinux += $(call ld-option,--orphan-handling=warn) ifeq ($(CONFIG_PPC64),y) ifeq ($(call cc-option-yn,-mcmodel=medium),y) @@ -189,7 +190,17 @@ else CHECKFLAGS += -D__LITTLE_ENDIAN__ endif +ifdef CONFIG_PPC32 KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o +else +ifeq ($(call ld-ifversion, -ge, 225000000, y),y) +# Have the linker provide sfpr if possible. 
+# There is a corresponding test in arch/powerpc/lib/Makefile +KBUILD_LDFLAGS_MODULE += --save-restore-funcs +else +KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o +endif +endif ifeq ($(CONFIG_476FPE_ERR46),y) KBUILD_LDFLAGS_MODULE += --ppc476-workaround \ diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index eccfcc88afae..5db43ebbe2df 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -10,13 +10,26 @@ __archpost: -include include/config/auto.conf include scripts/Kbuild.include +quiet_cmd_head_check = CHKHEAD $@ + cmd_head_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/head_check.sh "$(NM)" "$@" + quiet_cmd_relocs_check = CHKREL $@ - cmd_relocs_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" +ifdef CONFIG_PPC_BOOK3S_64 + cmd_relocs_check = \ + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" ; \ + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/unrel_branch_check.sh "$(OBJDUMP)" "$@" +else + cmd_relocs_check = \ + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" +endif # `@true` prevents complaint when there is nothing to be done vmlinux: FORCE @true +ifdef CONFIG_PPC64 + $(call cmd,head_check) +endif ifdef CONFIG_RELOCATABLE $(call if_changed,relocs_check) endif @@ -25,7 +38,7 @@ endif @true clean: - @true + rm -f .tmp_symbols.txt PHONY += FORCE clean diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index e82f333cc84a..a7814a7b1523 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -95,13 +95,16 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ $(addprefix $(obj)/,$(libfdtheader)) -src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \ +src-wlib-y := string.S crt0.S stdio.c decompress.c main.c \ $(libfdt) libfdt-wrapper.c \ ns16550.c serial.c 
simple_alloc.c div64.S util.S \ elf_util.c $(zlib-y) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ uartlite.c mpc52xx-psc.c opal.c src-wlib-$(CONFIG_PPC64_BOOT_WRAPPER) += opal-calls.S +ifndef CONFIG_PPC64_BOOT_WRAPPER +src-wlib-y += crtsavres.S +endif src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c diff --git a/arch/powerpc/boot/crtsavres.S b/arch/powerpc/boot/crtsavres.S index f3d9b35c07d4..085fb2b9a8b8 100644 --- a/arch/powerpc/boot/crtsavres.S +++ b/arch/powerpc/boot/crtsavres.S @@ -37,12 +37,13 @@ * the executable file might be covered by the GNU General Public License. */ +#ifdef __powerpc64__ +#error "On PPC64, FPR save/restore functions are provided by the linker." +#endif + .file "crtsavres.S" .section ".text" -/* On PowerPC64 Linux, these functions are provided by the linker. */ -#ifndef __powerpc64__ - #define _GLOBAL(name) \ .type name,@function; \ .globl name; \ @@ -230,4 +231,3 @@ _GLOBAL(_rest32gpr_31_x) mtlr 0 mr 1,11 blr -#endif diff --git a/arch/powerpc/boot/dts/ac14xx.dts b/arch/powerpc/boot/dts/ac14xx.dts index 27fcabc2f857..83bcfd865167 100644 --- a/arch/powerpc/boot/dts/ac14xx.dts +++ b/arch/powerpc/boot/dts/ac14xx.dts @@ -10,7 +10,7 @@ */ -#include <mpc5121.dtsi> +#include "mpc5121.dtsi" / { model = "ac14xx"; diff --git a/arch/powerpc/boot/dts/digsy_mtc.dts b/arch/powerpc/boot/dts/digsy_mtc.dts index 955bff629df3..c280e75c86bf 100644 --- a/arch/powerpc/boot/dts/digsy_mtc.dts +++ b/arch/powerpc/boot/dts/digsy_mtc.dts @@ -73,7 +73,7 @@ i2c@3d00 { eeprom@50 { - compatible = "at,24c08"; + compatible = "atmel,24c08"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/fsl/b4qds.dtsi b/arch/powerpc/boot/dts/fsl/b4qds.dtsi index 3785ef826d07..999efd3bc167 100644 --- a/arch/powerpc/boot/dts/fsl/b4qds.dtsi +++ b/arch/powerpc/boot/dts/fsl/b4qds.dtsi @@ -166,19 +166,19 @@ reg = <0>; eeprom@50 { - compatible = "at24,24c64"; + 
compatible = "atmel,24c64"; reg = <0x50>; }; eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@53 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x53>; }; eeprom@57 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x57>; }; rtc@68 { diff --git a/arch/powerpc/boot/dts/fsl/c293pcie.dts b/arch/powerpc/boot/dts/fsl/c293pcie.dts index 66709788429d..5e905e0857cf 100644 --- a/arch/powerpc/boot/dts/fsl/c293pcie.dts +++ b/arch/powerpc/boot/dts/fsl/c293pcie.dts @@ -153,7 +153,7 @@ &soc { i2c@3000 { eeprom@50 { - compatible = "st,24c1024"; + compatible = "st,24c1024", "atmel,24c1024"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi b/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi index a8e4ba070104..2ca9cee2ddeb 100644 --- a/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi @@ -89,7 +89,7 @@ &board_soc { i2c@3000 { eeprom@50 { - compatible = "st,24c256"; + compatible = "st,24c256", "atmel,24c256"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/fsl/p1023rdb.dts b/arch/powerpc/boot/dts/fsl/p1023rdb.dts index 9716ca64651c..ead928364beb 100644 --- a/arch/powerpc/boot/dts/fsl/p1023rdb.dts +++ b/arch/powerpc/boot/dts/fsl/p1023rdb.dts @@ -79,7 +79,7 @@ i2c@3000 { eeprom@53 { - compatible = "at24,24c04"; + compatible = "atmel,24c04"; reg = <0x53>; }; diff --git a/arch/powerpc/boot/dts/fsl/p2041rdb.dts b/arch/powerpc/boot/dts/fsl/p2041rdb.dts index e50fea95a853..950816b9d6e1 100644 --- a/arch/powerpc/boot/dts/fsl/p2041rdb.dts +++ b/arch/powerpc/boot/dts/fsl/p2041rdb.dts @@ -127,7 +127,7 @@ reg = <0x48>; }; eeprom@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; rtc@68 { @@ -142,7 +142,7 @@ i2c@118100 { eeprom@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/p3041ds.dts b/arch/powerpc/boot/dts/fsl/p3041ds.dts index 
40748e415adb..6f5f7283c533 100644 --- a/arch/powerpc/boot/dts/fsl/p3041ds.dts +++ b/arch/powerpc/boot/dts/fsl/p3041ds.dts @@ -124,11 +124,11 @@ i2c@118100 { eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/p4080ds.dts b/arch/powerpc/boot/dts/fsl/p4080ds.dts index 816b9788d5f6..65e20152e22f 100644 --- a/arch/powerpc/boot/dts/fsl/p4080ds.dts +++ b/arch/powerpc/boot/dts/fsl/p4080ds.dts @@ -125,11 +125,11 @@ i2c@118100 { eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; rtc@68 { diff --git a/arch/powerpc/boot/dts/fsl/p5020ds.dts b/arch/powerpc/boot/dts/fsl/p5020ds.dts index cd6f37386111..b24adf902d8d 100644 --- a/arch/powerpc/boot/dts/fsl/p5020ds.dts +++ b/arch/powerpc/boot/dts/fsl/p5020ds.dts @@ -124,11 +124,11 @@ i2c@118100 { eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/p5040ds.dts b/arch/powerpc/boot/dts/fsl/p5040ds.dts index 45084738cf4e..30850b3228e0 100644 --- a/arch/powerpc/boot/dts/fsl/p5040ds.dts +++ b/arch/powerpc/boot/dts/fsl/p5040ds.dts @@ -133,11 +133,11 @@ i2c@118100 { eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/t208xqds.dtsi b/arch/powerpc/boot/dts/fsl/t208xqds.dtsi index ec080bd01b09..db4139999b28 100644 --- a/arch/powerpc/boot/dts/fsl/t208xqds.dtsi +++ b/arch/powerpc/boot/dts/fsl/t208xqds.dtsi @@ -147,17 +147,17 @@ reg = <0x0>; eeprom@50 { - compatible = "at24,24c512"; + compatible = "atmel,24c512"; reg = 
<0x50>; }; eeprom@51 { - compatible = "at24,24c02"; + compatible = "atmel,24c02"; reg = <0x51>; }; eeprom@57 { - compatible = "at24,24c02"; + compatible = "atmel,24c02"; reg = <0x57>; }; @@ -174,7 +174,7 @@ reg = <0x1>; eeprom@55 { - compatible = "at24,24c02"; + compatible = "atmel,24c02"; reg = <0x55>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/t4240qds.dts b/arch/powerpc/boot/dts/fsl/t4240qds.dts index 9573ceada07c..c0913ac5aaad 100644 --- a/arch/powerpc/boot/dts/fsl/t4240qds.dts +++ b/arch/powerpc/boot/dts/fsl/t4240qds.dts @@ -377,27 +377,27 @@ reg = <0>; eeprom@51 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x51>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; eeprom@53 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x53>; }; eeprom@54 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x54>; }; eeprom@55 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x55>; }; eeprom@56 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x56>; }; rtc@68 { diff --git a/arch/powerpc/boot/dts/fsl/t4240rdb.dts b/arch/powerpc/boot/dts/fsl/t4240rdb.dts index 8166c660712a..15eb0a3f7290 100644 --- a/arch/powerpc/boot/dts/fsl/t4240rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t4240rdb.dts @@ -130,15 +130,15 @@ reg = <0x2f>; }; eeprom@52 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x52>; }; eeprom@54 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x54>; }; eeprom@56 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x56>; }; rtc@68 { diff --git a/arch/powerpc/boot/dts/fsp2.dts b/arch/powerpc/boot/dts/fsp2.dts new file mode 100644 index 000000000000..475953ada707 --- /dev/null +++ b/arch/powerpc/boot/dts/fsp2.dts @@ -0,0 +1,608 @@ +/* + * Device Tree Source for FSP2 + * + * Copyright 2010,2012 IBM Corp. 
+ * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without + * any warranty of any kind, whether express or implied. + */ + + +/dts-v1/; + +/ { + #address-cells = <2>; + #size-cells = <1>; + model = "ibm,fsp2"; + compatible = "ibm,fsp2"; + dcr-parent = <&{/cpus/cpu@0}>; + + aliases { + ethernet0 = &EMAC0; + ethernet1 = &EMAC1; + serial0 = &UART0; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + model = "PowerPC, 476FSP2"; + reg = <0x0>; + clock-frequency = <0>; /* Filled in by cuboot */ + timebase-frequency = <0>; /* Filled in by cuboot */ + i-cache-line-size = <32>; + d-cache-line-size = <32>; + d-cache-size = <32768>; + i-cache-size = <32768>; + dcr-controller; + dcr-access-method = "native"; + }; + }; + + memory { + device_type = "memory"; + reg = <0x00000000 0x00000000 0x00000000>; /* Filled in by + cuboot */ + }; + + clocks { + mmc_clk: mmc_clk { + compatible = "fixed-clock"; + clock-frequency = <50000000>; + clock-output-names = "mmc_clk"; + }; + }; + + UIC0: uic0 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <0>; + dcr-reg = <0x2c0 0x8>; + }; + + /* "interrupts" field is <bit level bit level> + first pair is non-critical, second is critical */ + UIC1_0: uic1_0 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <1>; + dcr-reg = <0x2c8 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <21 0x4 4 0x84>; + }; + + /* PSI and DMA */ + UIC1_1: uic1_1 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <2>; + dcr-reg = <0x350 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <22 0x4 5 0x84>; + }; + + /* Ethernet and USB */ + UIC1_2: uic1_2 { + #address-cells = <0>; + 
#size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <3>; + dcr-reg = <0x358 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <23 0x4 6 0x84>; + }; + + /* PLB Errors */ + UIC1_3: uic1_3 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <4>; + dcr-reg = <0x360 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <24 0x4 7 0x84>; + }; + + UIC1_4: uic1_4 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <5>; + dcr-reg = <0x368 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <25 0x4 8 0x84>; + }; + + UIC1_5: uic1_5 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <6>; + dcr-reg = <0x370 0x8>; + interrupt-parent = <&UIC0>; + interrupts = <26 0x4 9 0x84>; + }; + + /* 2nd level UICs for FSI */ + UIC2_0: uic2_0 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <7>; + dcr-reg = <0x2d0 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <16 0x4 0 0x84>; + }; + + UIC2_1: uic2_1 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <8>; + dcr-reg = <0x2d8 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <17 0x4 1 0x84>; + }; + + UIC2_2: uic2_2 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <9>; + dcr-reg = <0x2e0 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <18 0x4 2 0x84>; + }; + + UIC2_3: uic2_3 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <10>; + dcr-reg = <0x2e8 0x8>; + 
interrupt-parent = <&UIC1_0>; + interrupts = <19 0x4 3 0x84>; + }; + + UIC2_4: uic2_4 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <11>; + dcr-reg = <0x2f0 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <20 0x4 4 0x84>; + }; + + UIC2_5: uic2_5 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <12>; + dcr-reg = <0x2f8 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <21 0x4 5 0x84>; + }; + + UIC2_6: uic2_6 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <13>; + dcr-reg = <0x300 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <22 0x4 6 0x84>; + }; + + UIC2_7: uic2_7 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <14>; + dcr-reg = <0x308 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <23 0x4 7 0x84>; + }; + + UIC2_8: uic2_8 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <15>; + dcr-reg = <0x310 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <24 0x4 8 0x84>; + }; + + UIC2_9: uic2_9 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <16>; + dcr-reg = <0x318 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <25 0x4 9 0x84>; + }; + + UIC2_10: uic2_10 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <17>; + dcr-reg = <0x320 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <26 0x4 10 0x84>; + }; + + UIC2_11: uic2_11 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = 
"ibm,uic"; + interrupt-controller; + cell-index = <18>; + dcr-reg = <0x328 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <27 0x4 11 0x84>; + }; + + UIC2_12: uic2_12 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <19>; + dcr-reg = <0x330 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <28 0x4 12 0x84>; + }; + + UIC2_13: uic2_13 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <20>; + dcr-reg = <0x338 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <29 0x4 13 0x84>; + }; + + UIC2_14: uic2_14 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <21>; + dcr-reg = <0x340 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <30 0x4 14 0x84>; + }; + + UIC2_15: uic2_15 { + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <22>; + dcr-reg = <0x348 0x8>; + interrupt-parent = <&UIC1_0>; + interrupts = <31 0x4 15 0x84>; + }; + + mmc0: sdhci@020c0000 { + compatible = "st,sdhci-stih407", "st,sdhci"; + status = "disabled"; + reg = <0x020c0000 0x20000>; + reg-names = "mmc"; + interrupt-parent = <&UIC1_3>; + interrupts = <21 0x4 22 0x4>; + interrupt-names = "mmcirq"; + pinctrl-names = "default"; + pinctrl-0 = <>; + clock-names = "mmc"; + clocks = <&mmc_clk>; + }; + + plb6 { + compatible = "ibm,plb6"; + #address-cells = <2>; + #size-cells = <1>; + ranges; + + MCW0: memory-controller-wrapper { + compatible = "ibm,cw-476fsp2"; + dcr-reg = <0x11111800 0x40>; + }; + + MCIF0: memory-controller { + compatible = "ibm,sdram-476fsp2", "ibm,sdram-4xx-ddr3"; + dcr-reg = <0x11120000 0x10000>; + mcer-device = <&MCW0>; + interrupt-parent = <&UIC0>; + interrupts = <10 0x84 /* ECC UE */ + 11 0x84>; /* ECC CE */ + }; + }; + + plb4 { 
+ compatible = "ibm,plb4"; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x00000000 0x00000010 0x00000000 0x80000000 + 0x80000000 0x00000010 0x80000000 0x80000000>; + clock-frequency = <333333334>; + + plb6-system-hung-irq { + compatible = "ibm,bus-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <0 0x84>; + }; + + l2-error-irq { + compatible = "ibm,bus-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <20 0x84>; + }; + + plb6-plb4-irq { + compatible = "ibm,bus-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <1 0x84>; + }; + + plb4-ahb-irq { + compatible = "ibm,bus-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC1_3>; + interrupts = <20 0x84>; + }; + + opbd-error-irq { + compatible = "ibm,opbd-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC1_4>; + interrupts = <5 0x84>; + }; + + cmu-error-irq { + compatible = "ibm,cmu-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <28 0x84>; + }; + + conf-error-irq { + compatible = "ibm,conf-error-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC1_4>; + interrupts = <11 0x84>; + }; + + mc-ue-irq { + compatible = "ibm,mc-ue-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <10 0x84>; + }; + + reset-warning-irq { + compatible = "ibm,reset-warning-irq"; + #interrupt-cells = <2>; + interrupt-parent = <&UIC0>; + interrupts = <17 0x84>; + }; + + MAL0: mcmal0 { + #interrupt-cells = <1>; + #address-cells = <0>; + #size-cells = <0>; + compatible = "ibm,mcmal"; + dcr-reg = <0x80 0x80>; + num-tx-chans = <1>; + num-rx-chans = <1>; + interrupt-parent = <&MAL0>; + interrupts = <0 1 2 3 4>; + /* index interrupt-parent interrupt# type */ + interrupt-map = </*TXEOB*/ 0 &UIC1_2 4 0x4 + /*RXEOB*/ 1 &UIC1_2 3 0x4 + /*SERR*/ 2 &UIC1_2 7 0x4 + /*TXDE*/ 3 &UIC1_2 6 0x4 + /*RXDE*/ 4 &UIC1_2 5 0x4>; + }; + + MAL1: mcmal1 { + #interrupt-cells 
= <1>; + #address-cells = <0>; + #size-cells = <0>; + compatible = "ibm,mcmal"; + dcr-reg = <0x100 0x80>; + num-tx-chans = <1>; + num-rx-chans = <1>; + interrupt-parent = <&MAL1>; + interrupts = <0 1 2 3 4>; + /* index interrupt-parent interrupt# type */ + interrupt-map = </*TXEOB*/ 0 &UIC1_2 12 0x4 + /*RXEOB*/ 1 &UIC1_2 11 0x4 + /*SERR*/ 2 &UIC1_2 15 0x4 + /*TXDE*/ 3 &UIC1_2 14 0x4 + /*RXDE*/ 4 &UIC1_2 13 0x4>; + }; + + opb { + compatible = "ibm,opb"; + #address-cells = <1>; + #size-cells = <1>; + ranges; // pass-thru to parent bus + clock-frequency = <83333334>; + + EMAC0: ethernet@b0000000 { + linux,network-index = <0>; + device_type = "network"; + compatible = "ibm,emac4sync"; + has-inverted-stacr-oc; + interrupt-parent = <&UIC1_2>; + interrupts = <1 0x4 0 0x4>; + reg = <0xb0000000 0x100>; + local-mac-address = [000000000000]; /* Filled in by + cuboot */ + mal-device = <&MAL0>; + mal-tx-channel = <0>; + mal-rx-channel = <0>; + cell-index = <0>; + max-frame-size = <1500>; + rx-fifo-size = <4096>; + tx-fifo-size = <4096>; + rx-fifo-size-gige = <16384>; + tx-fifo-size-gige = <8192>; + phy-address = <1>; + phy-mode = "rgmii"; + phy-map = <00000003>; + rgmii-device = <&RGMII>; + rgmii-channel = <0>; + }; + + EMAC1: ethernet@b0000100 { + linux,network-index = <1>; + device_type = "network"; + compatible = "ibm,emac4sync"; + has-inverted-stacr-oc; + interrupt-parent = <&UIC1_2>; + interrupts = <9 0x4 8 0x4>; + reg = <0xb0000100 0x100>; + local-mac-address = [000000000000]; /* Filled in by + cuboot */ + mal-device = <&MAL1>; + mal-tx-channel = <0>; + mal-rx-channel = <0>; + cell-index = <1>; + max-frame-size = <1500>; + rx-fifo-size = <4096>; + tx-fifo-size = <4096>; + rx-fifo-size-gige = <16384>; + tx-fifo-size-gige = <8192>; + phy-address = <2>; + phy-mode = "rgmii"; + phy-map = <00000003>; + rgmii-device = <&RGMII>; + rgmii-channel = <1>; + }; + + RGMII: rgmii@b0000600 { + compatible = "ibm,rgmii"; + has-mdio; + reg = <0xb0000600 0x8>; + }; + + UART0: 
serial@b0020000 { + device_type = "serial"; + compatible = "ns16550"; + reg = <0xb0020000 0x8>; + virtual-reg = <0xb0020000>; + clock-frequency = <20833333>; + current-speed = <115200>; + interrupt-parent = <&UIC0>; + interrupts = <31 0x4>; + }; + }; + + OHCI1: ohci@02040000 { + compatible = "ohci-le"; + reg = <0x02040000 0xa0>; + interrupt-parent = <&UIC1_3>; + interrupts = <28 0x8 29 0x8>; + }; + + OHCI2: ohci@02080000 { + compatible = "ohci-le"; + reg = <0x02080000 0xa0>; + interrupt-parent = <&UIC1_3>; + interrupts = <30 0x8 31 0x8>; + }; + + EHCI: ehci@02000000 { + compatible = "usb-ehci"; + reg = <0x02000000 0xa4>; + interrupt-parent = <&UIC1_3>; + interrupts = <23 0x4>; + }; + + }; + + chosen { + linux,stdout-path = "/plb/opb/serial@b0020000"; + bootargs = "console=ttyS0,115200 rw log_buf_len=32768 debug"; + }; +}; diff --git a/arch/powerpc/boot/dts/mpc5121ads.dts b/arch/powerpc/boot/dts/mpc5121ads.dts index 75888ce2c792..1e81a7e32d18 100644 --- a/arch/powerpc/boot/dts/mpc5121ads.dts +++ b/arch/powerpc/boot/dts/mpc5121ads.dts @@ -9,7 +9,7 @@ * option) any later version. 
*/ -#include <mpc5121.dtsi> +#include "mpc5121.dtsi" / { model = "mpc5121ads"; @@ -94,7 +94,7 @@ }; eeprom@50 { - compatible = "at,24c32"; + compatible = "atmel,24c32"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/mpc8308_p1m.dts b/arch/powerpc/boot/dts/mpc8308_p1m.dts index 57f86cdf9f36..cab933b3957a 100644 --- a/arch/powerpc/boot/dts/mpc8308_p1m.dts +++ b/arch/powerpc/boot/dts/mpc8308_p1m.dts @@ -123,7 +123,7 @@ interrupt-parent = <&ipic>; dfsrr; fram@50 { - compatible = "ramtron,24c64"; + compatible = "ramtron,24c64", "atmel,24c64"; reg = <0x50>; }; }; diff --git a/arch/powerpc/boot/dts/mpc8349emitx.dts b/arch/powerpc/boot/dts/mpc8349emitx.dts index 90aed3ac2f69..648a85858eb5 100644 --- a/arch/powerpc/boot/dts/mpc8349emitx.dts +++ b/arch/powerpc/boot/dts/mpc8349emitx.dts @@ -92,7 +92,7 @@ dfsrr; eeprom: at24@50 { - compatible = "st,24c256"; + compatible = "st,24c256", "atmel,24c256"; reg = <0x50>; }; @@ -130,7 +130,7 @@ }; spd: at24@51 { - compatible = "at24,spd"; + compatible = "atmel,spd"; reg = <0x51>; }; diff --git a/arch/powerpc/boot/dts/mpc8377_rdb.dts b/arch/powerpc/boot/dts/mpc8377_rdb.dts index e32613963ab0..5e85d8c93bca 100644 --- a/arch/powerpc/boot/dts/mpc8377_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8377_rdb.dts @@ -150,7 +150,7 @@ }; at24@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/mpc8377_wlan.dts b/arch/powerpc/boot/dts/mpc8377_wlan.dts index c0c790168b96..fee15fcbb46f 100644 --- a/arch/powerpc/boot/dts/mpc8377_wlan.dts +++ b/arch/powerpc/boot/dts/mpc8377_wlan.dts @@ -135,7 +135,7 @@ dfsrr; at24@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/mpc8378_rdb.dts b/arch/powerpc/boot/dts/mpc8378_rdb.dts index 71842fcd621f..e973d61956b9 100644 --- a/arch/powerpc/boot/dts/mpc8378_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8378_rdb.dts @@ -150,7 +150,7 @@ }; at24@50 { - compatible = "at24,24c256"; + compatible = 
"atmel,24c256"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/mpc8379_rdb.dts b/arch/powerpc/boot/dts/mpc8379_rdb.dts index e442a29b2fe0..ed5d12ff2ee0 100644 --- a/arch/powerpc/boot/dts/mpc8379_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8379_rdb.dts @@ -148,7 +148,7 @@ }; at24@50 { - compatible = "at24,24c256"; + compatible = "atmel,24c256"; reg = <0x50>; }; diff --git a/arch/powerpc/boot/dts/pcm030.dts b/arch/powerpc/boot/dts/pcm030.dts index 192e66af0001..836e47cc4bed 100644 --- a/arch/powerpc/boot/dts/pcm030.dts +++ b/arch/powerpc/boot/dts/pcm030.dts @@ -71,7 +71,7 @@ reg = <0x51>; }; eeprom@52 { - compatible = "catalyst,24c32"; + compatible = "catalyst,24c32", "atmel,24c32"; reg = <0x52>; pagesize = <32>; }; diff --git a/arch/powerpc/boot/dts/pcm032.dts b/arch/powerpc/boot/dts/pcm032.dts index 96b139bf50e9..576249bf2fb9 100644 --- a/arch/powerpc/boot/dts/pcm032.dts +++ b/arch/powerpc/boot/dts/pcm032.dts @@ -75,7 +75,7 @@ reg = <0x51>; }; eeprom@52 { - compatible = "catalyst,24c32"; + compatible = "catalyst,24c32", "atmel,24c32"; reg = <0x52>; pagesize = <32>; }; diff --git a/arch/powerpc/boot/dts/pdm360ng.dts b/arch/powerpc/boot/dts/pdm360ng.dts index 0cec7244abe7..445b88114009 100644 --- a/arch/powerpc/boot/dts/pdm360ng.dts +++ b/arch/powerpc/boot/dts/pdm360ng.dts @@ -13,7 +13,7 @@ * option) any later version. 
*/ -#include <mpc5121.dtsi> +#include "mpc5121.dtsi" / { model = "pdm360ng"; diff --git a/arch/powerpc/boot/dts/sequoia.dts b/arch/powerpc/boot/dts/sequoia.dts index b1d329246b08..e41b88a5eaee 100644 --- a/arch/powerpc/boot/dts/sequoia.dts +++ b/arch/powerpc/boot/dts/sequoia.dts @@ -229,7 +229,7 @@ }; partition@84000 { label = "user"; - reg = <0x00000000 0x01f7c000>; + reg = <0x00084000 0x01f7c000>; }; }; }; diff --git a/arch/powerpc/boot/dts/warp.dts b/arch/powerpc/boot/dts/warp.dts index e576ee85c42f..ea9053ef4819 100644 --- a/arch/powerpc/boot/dts/warp.dts +++ b/arch/powerpc/boot/dts/warp.dts @@ -238,7 +238,7 @@ /* This will create 52 and 53 */ at24@52 { - compatible = "at,24c04"; + compatible = "atmel,24c04"; reg = <0x52>; }; }; diff --git a/arch/powerpc/boot/ppc_asm.h b/arch/powerpc/boot/ppc_asm.h index b03373d8b386..68e388ee94fe 100644 --- a/arch/powerpc/boot/ppc_asm.h +++ b/arch/powerpc/boot/ppc_asm.h @@ -67,13 +67,15 @@ #define MSR_LE 0x0000000000000001 #define FIXUP_ENDIAN \ - tdi 0, 0, 0x48; /* Reverse endian of b . + 8 */ \ - b $+36; /* Skip trampoline if endian is good */ \ - .long 0x05009f42; /* bcl 20,31,$+4 */ \ - .long 0xa602487d; /* mflr r10 */ \ - .long 0x1c004a39; /* addi r10,r10,28 */ \ + tdi 0,0,0x48; /* Reverse endian of b . 
+ 8 */ \ + b $+44; /* Skip trampoline if endian is good */ \ .long 0xa600607d; /* mfmsr r11 */ \ .long 0x01006b69; /* xori r11,r11,1 */ \ + .long 0x00004039; /* li r10,0 */ \ + .long 0x6401417d; /* mtmsrd r10,1 */ \ + .long 0x05009f42; /* bcl 20,31,$+4 */ \ + .long 0xa602487d; /* mflr r10 */ \ + .long 0x14004a39; /* addi r10,r10,20 */ \ .long 0xa6035a7d; /* mtsrr0 r10 */ \ .long 0xa6037b7d; /* mtsrr1 r11 */ \ .long 0x2400004c /* rfid */ diff --git a/arch/powerpc/configs/44x/fsp2_defconfig b/arch/powerpc/configs/44x/fsp2_defconfig new file mode 100644 index 000000000000..e8e6a6999852 --- /dev/null +++ b/arch/powerpc/configs/44x/fsp2_defconfig @@ -0,0 +1,126 @@ +CONFIG_44x=y +# CONFIG_SWAP is not set +CONFIG_SYSVIPC=y +# CONFIG_CROSS_MEMORY_ATTACH is not set +# CONFIG_FHANDLE is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=16 +CONFIG_BLK_DEV_INITRD=y +# CONFIG_RD_LZMA is not set +# CONFIG_RD_XZ is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +CONFIG_KALLSYMS_ALL=y +CONFIG_BPF_SYSCALL=y +CONFIG_EMBEDDED=y +CONFIG_PROFILING=y +CONFIG_OPROFILE=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PPC_47x=y +# CONFIG_EBONY is not set +CONFIG_FSP2=y +CONFIG_476FPE_ERR46=y +CONFIG_SWIOTLB=y +CONFIG_KEXEC=y +CONFIG_CRASH_DUMP=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="ip=on rw" +# CONFIG_SUSPEND is not set +# CONFIG_PCI is not set +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_IPV6 is not set +CONFIG_VLAN_8021Q=m +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_CONNECTOR=y +CONFIG_MTD=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_JEDECPROBE=y +CONFIG_MTD_CFI_AMDSTD=y +CONFIG_MTD_PHYSMAP_OF=y 
+CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=35000 +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=y +# CONFIG_SATA_PMP is not set +# CONFIG_ATA_SFF is not set +CONFIG_NETDEVICES=y +CONFIG_BONDING=m +CONFIG_IBM_EMAC=m +# CONFIG_INPUT is not set +# CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_DEVMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_NR_UARTS=32 +CONFIG_SERIAL_8250_RUNTIME_UARTS=32 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_OF_PLATFORM=y +# CONFIG_HW_RANDOM is not set +CONFIG_I2C=y +CONFIG_I2C_IBM_IIC=y +CONFIG_PTP_1588_CLOCK=y +# CONFIG_HWMON is not set +CONFIG_THERMAL=y +CONFIG_WATCHDOG=y +CONFIG_BOOKE_WDT=y +CONFIG_USB=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_OHCI_HCD=y +CONFIG_MMC=y +CONFIG_MMC_DEBUG=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y +CONFIG_MMC_SDHCI_OF_ARASAN=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_M41T80=y +CONFIG_EXT2_FS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_JFFS2_FS=y +CONFIG_JFFS2_FS_WBUF_VERIFY=y +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_CRAMFS=y +CONFIG_NFS_FS=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_ROOT_NFS=y +CONFIG_NLS_DEFAULT="n" +CONFIG_XZ_DEC=y +CONFIG_PRINTK_TIME=y +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=3 +CONFIG_DYNAMIC_DEBUG=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_FS=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_CRYPTO_CBC=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_PCBC=y +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_DES=y +# CONFIG_CRYPTO_HW is not set diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index c0deafc212b8..25d42bd3f114 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -74,6 +74,11 @@ do { \ ___p1; \ }) +/* + * This must resolve to hwsync on SMP for the context switch path. 
+ * See _switch, and core scheduler context switch memory ordering + * comments. + */ #define smp_mb__before_spinlock() smp_mb() #include <asm-generic/barrier.h> diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 33a24fdd7958..b750ffef83c7 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -206,68 +206,13 @@ static __inline__ void __clear_bit_unlock(int nr, volatile unsigned long *addr) * Return the zero-based bit position (LE, not IBM bit numbering) of * the most significant 1-bit in a double word. */ -static __inline__ __attribute__((const)) -int __ilog2(unsigned long x) -{ - int lz; +#define __ilog2(x) ilog2(x) - asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (x)); - return BITS_PER_LONG - 1 - lz; -} +#include <asm-generic/bitops/ffz.h> -static inline __attribute__((const)) -int __ilog2_u32(u32 n) -{ - int bit; - asm ("cntlzw %0,%1" : "=r" (bit) : "r" (n)); - return 31 - bit; -} +#include <asm-generic/bitops/builtin-__ffs.h> -#ifdef __powerpc64__ -static inline __attribute__((const)) -int __ilog2_u64(u64 n) -{ - int bit; - asm ("cntlzd %0,%1" : "=r" (bit) : "r" (n)); - return 63 - bit; -} -#endif - -/* - * Determines the bit position of the least significant 0 bit in the - * specified double word. The returned bit position will be - * zero-based, starting from the right side (63/31 - 0). - */ -static __inline__ unsigned long ffz(unsigned long x) -{ - /* no zero exists anywhere in the 8 byte area. */ - if ((x = ~x) == 0) - return BITS_PER_LONG; - - /* - * Calculate the bit position of the least significant '1' bit in x - * (since x has been changed this will actually be the least significant - * '0' bit in * the original x). Note: (x & -x) gives us a mask that - * is the least significant * (RIGHT-most) 1-bit of the value in x. - */ - return __ilog2(x & -x); -} - -static __inline__ unsigned long __ffs(unsigned long x) -{ - return __ilog2(x & -x); -} - -/* - * ffs: find first bit set. 
This is defined the same way as - * the libc and compiler builtin ffs routines, therefore - * differs in spirit from the above ffz (man ffs). - */ -static __inline__ int ffs(int x) -{ - unsigned long i = (unsigned long)x; - return __ilog2(i & -i) + 1; -} +#include <asm-generic/bitops/builtin-ffs.h> /* * fls: find last (most-significant) bit set. @@ -275,33 +220,15 @@ static __inline__ int ffs(int x) */ static __inline__ int fls(unsigned int x) { - int lz; - - asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); - return 32 - lz; + return 32 - __builtin_clz(x); } -static __inline__ unsigned long __fls(unsigned long x) -{ - return __ilog2(x); -} +#include <asm-generic/bitops/builtin-__fls.h> -/* - * 64-bit can do this using one cntlzd (count leading zeroes doubleword) - * instruction; for 32-bit we use the generic version, which does two - * 32-bit fls calls. - */ -#ifdef __powerpc64__ static __inline__ int fls64(__u64 x) { - int lz; - - asm ("cntlzd %0,%1" : "=r" (lz) : "r" (x)); - return 64 - lz; + return 64 - __builtin_clzll(x); } -#else -#include <asm-generic/bitops/fls64.h> -#endif /* __powerpc64__ */ #ifdef CONFIG_PPC64 unsigned int __arch_hweight8(unsigned int w); diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index d310546e5d9d..a120e7f8d535 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -31,7 +31,8 @@ extern struct kmem_cache *pgtable_cache[]; static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 26ed228d4dc6..7fb755880409 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ 
b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -297,6 +297,8 @@ static inline void __ptep_set_access_flags(struct mm_struct *mm, extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp); +int map_kernel_page(unsigned long va, phys_addr_t pa, int flags); + /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 4e957b027fe0..0ce513f2926f 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -89,6 +89,9 @@ static inline int hash__pgd_bad(pgd_t pgd) { return (pgd_val(pgd) == 0); } +#ifdef CONFIG_STRICT_KERNEL_RWX +extern void hash__mark_rodata_ro(void); +#endif extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge); diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index cd5e7aa8cc34..20b1485ff1e8 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -53,10 +53,11 @@ extern void __tlb_remove_table(void *_table); static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm) { #ifdef CONFIG_PPC_64K_PAGES - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_page(pgtable_gfp_flags(mm, PGALLOC_GFP)); #else struct page *page; - page = alloc_pages(PGALLOC_GFP | __GFP_REPEAT, 4); + page = alloc_pages(pgtable_gfp_flags(mm, PGALLOC_GFP | __GFP_REPEAT), + 4); if (!page) return NULL; return (pgd_t *) page_address(page); @@ -76,7 +77,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { if (radix_enabled()) return radix__pgd_alloc(mm); - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + 
pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -93,7 +95,8 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) @@ -119,7 +122,8 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) @@ -168,7 +172,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, struct page *page; pte_t *pte; - pte = pte_alloc_one_kernel(mm, address); + pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT); if (!pte) return NULL; page = virt_to_page(pte); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 85bc9875c3be..c0737c86a362 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -5,6 +5,7 @@ #ifndef __ASSEMBLY__ #include <linux/mmdebug.h> +#include <linux/bug.h> #endif /* @@ -79,6 +80,9 @@ #define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ #define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ +#define _PAGE_DEVMAP _RPAGE_SW1 /* software: ZONE_DEVICE page */ +#define __HAVE_ARCH_PTE_DEVMAP + /* * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE * Instead of fixing all of them, add an alternate define which @@ -599,6 +603,16 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } +static inline 
pte_t pte_mkdevmap(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP); +} + +static inline int pte_devmap(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP)); +} + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { /* FIXME!! check whether this need to be a conditional */ @@ -1146,6 +1160,37 @@ static inline bool arch_needs_pgtable_deposit(void) return true; } + +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP)); +} + +static inline int pmd_devmap(pmd_t pmd) +{ + return pte_devmap(pmd_pte(pmd)); +} + +static inline int pud_devmap(pud_t pud) +{ + return 0; +} + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static inline const int pud_pfn(pud_t pud) +{ + /* + * Currently all calls to pud_pfn() are gated around a pud_devmap() + * check so this should never be used. If it grows another user we + * want to know about it. + */ + BUILD_BUG(); + return 0; +} #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index ac16d1943022..487709ff6875 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -116,6 +116,10 @@ #define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE) #define RADIX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) +#ifdef CONFIG_STRICT_KERNEL_RWX +extern void radix__mark_rodata_ro(void); +#endif + static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr, unsigned long set) { @@ -252,7 +256,7 @@ static inline int radix__pgd_bad(pgd_t pgd) static inline int radix__pmd_trans_huge(pmd_t pmd) { - return !!(pmd_val(pmd) & _PAGE_PTE); + return (pmd_val(pmd) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE; } static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) diff --git 
a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index abef812de7f8..5482928eea1b 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -83,8 +83,16 @@ static inline unsigned long ppc_function_entry(void *func) * On PPC64 ABIv1 the function pointer actually points to the * function's descriptor. The first entry in the descriptor is the * address of the function text. + * + * However, we may also receive pointer to an assembly symbol. To + * detect that, we first check if the function pointer we receive + * already points to kernel/module text and we only dereference it + * if it doesn't. */ - return ((func_descr_t *)func)->entry; + if (kernel_text_address((unsigned long)func)) + return (unsigned long)func; + else + return ((func_descr_t *)func)->entry; #else return (unsigned long)func; #endif diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index f70cbfe0ec04..9f2ae0d25e15 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -56,6 +56,19 @@ static inline void ppc_msgsync(void) : : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300)); } +static inline void _ppc_msgclr(u32 msg) +{ + __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGCLR(%1), PPC_MSGCLRP(%1), %0) + : : "i" (CPU_FTR_HVMODE), "r" (msg)); +} + +static inline void ppc_msgclr(enum ppc_dbell type) +{ + u32 msg = PPC_DBELL_TYPE(type); + + _ppc_msgclr(msg); +} + #else /* CONFIG_PPC_BOOK3S */ #define PPC_DBELL_MSGTYPE PPC_DBELL diff --git a/arch/powerpc/include/asm/delay.h b/arch/powerpc/include/asm/delay.h index 52e4d54da2a9..3df4417dd9c8 100644 --- a/arch/powerpc/include/asm/delay.h +++ b/arch/powerpc/include/asm/delay.h @@ -2,6 +2,7 @@ #define _ASM_POWERPC_DELAY_H #ifdef __KERNEL__ +#include <linux/processor.h> #include <asm/time.h> /* @@ -58,11 +59,18 @@ extern void udelay(unsigned long usecs); typeof(condition) __ret; \ unsigned long __loops = tb_ticks_per_usec * timeout; \ 
unsigned long __start = get_tbl(); \ - while (!(__ret = (condition)) && (tb_ticks_since(__start) <= __loops)) \ - if (delay) \ + \ + if (delay) { \ + while (!(__ret = (condition)) && \ + (tb_ticks_since(__start) <= __loops)) \ udelay(delay); \ - else \ - cpu_relax(); \ + } else { \ + spin_begin(); \ + while (!(__ret = (condition)) && \ + (tb_ticks_since(__start) <= __loops)) \ + spin_cpu_relax(); \ + spin_end(); \ + } \ if (!__ret) \ __ret = (condition); \ __ret; \ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 183d73b6ed99..9a318973af05 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -36,20 +36,38 @@ */ #include <asm/head-64.h> +/* PACA save area offsets (exgen, exmc, etc) */ #define EX_R9 0 #define EX_R10 8 #define EX_R11 16 #define EX_R12 24 #define EX_R13 32 -#define EX_SRR0 40 -#define EX_DAR 48 -#define EX_DSISR 56 -#define EX_CCR 60 -#define EX_R3 64 -#define EX_LR 72 -#define EX_CFAR 80 -#define EX_PPR 88 /* SMT thread status register (priority) */ -#define EX_CTR 96 +#define EX_DAR 40 +#define EX_DSISR 48 +#define EX_CCR 52 +#define EX_CFAR 56 +#define EX_PPR 64 +#if defined(CONFIG_RELOCATABLE) +#define EX_CTR 72 +#define EX_SIZE 10 /* size in u64 units */ +#else +#define EX_SIZE 9 /* size in u64 units */ +#endif + +/* + * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR + * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole + * in the save area so it's not necessary to overlap them. Could be used + * for future savings though if another 4 byte register was to be saved. + */ +#define EX_LR EX_DAR + +/* + * EX_R3 is only used by the bad_stack handler. bad_stack reloads and + * saves DAR from SPRN_DAR, and EX_DAR is not used. So EX_R3 can overlap + * with EX_DAR. 
+ */ +#define EX_R3 EX_DAR #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ @@ -236,6 +254,19 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define kvmppc_interrupt kvmppc_interrupt_pr #endif +/* + * Branch to label using its 0xC000 address. This results in instruction + * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned + * on using mtmsr rather than rfid. + * + * This could set the 0xc bits for !RELOCATABLE as an immediate, rather than + * load KBASE for a slight optimisation. + */ +#define BRANCH_TO_C000(reg, label) \ + __LOAD_HANDLER(reg, label); \ + mtctr reg; \ + bctr + #ifdef CONFIG_RELOCATABLE #define BRANCH_TO_COMMON(reg, label) \ __LOAD_HANDLER(reg, label); \ diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 60b91084f33c..ce88bbe1d809 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -43,6 +43,9 @@ #define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ + (0x1UL << 26)) +/* The upper limit percentage for user specified boot memory size (25%) */ +#define MAX_BOOT_MEM_RATIO 4 + #define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) /* Firmware provided dump sections */ @@ -200,6 +203,7 @@ struct fad_crash_memory_ranges { unsigned long long size; }; +extern int is_fadump_boot_memory_area(u64 addr, ulong size); extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); extern int fadump_reserve_mem(void); diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index 86eb87382031..d81eac5b509f 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -3,6 +3,7 @@ #include <asm/cache.h> +#ifdef __ASSEMBLY__ /* * We can't do CPP stringification and concatination directly into the section * name for some reason, so these macros can do it for us. 
@@ -49,8 +50,8 @@ * CLOSE_FIXED_SECTION() or elsewhere, there may be something * unexpected being added there. Remove the '. = x_len' line, rebuild, and * check what is pushing the section down. - * - If the build dies in linking, check arch/powerpc/kernel/vmlinux.lds.S - * for instructions. + * - If the build dies in linking, check arch/powerpc/tools/head_check.sh + * comments. * - If the kernel crashes or hangs in very early boot, it could be linker * stubs at the start of the main text. */ @@ -63,11 +64,29 @@ . = 0x0; \ start_##sname: +/* + * .linker_stub_catch section is used to catch linker stubs from being + * inserted in our .text section, above the start_text label (which breaks + * the ABS_ADDR calculation). See kernel/vmlinux.lds.S and tools/head_check.sh + * for more details. We would prefer to just keep a cacheline (0x80), but + * 0x100 seems to be how the linker aligns branch stub groups. + */ +#ifdef CONFIG_LD_HEAD_STUB_CATCH +#define OPEN_TEXT_SECTION(start) \ + .section ".linker_stub_catch","ax",@progbits; \ +linker_stub_catch: \ + . = 0x4; \ + text_start = (start) + 0x100; \ + .section ".text","ax",@progbits; \ + .balign 0x100; \ +start_text: +#else #define OPEN_TEXT_SECTION(start) \ text_start = (start); \ .section ".text","ax",@progbits; \ . 
= 0x0; \ start_text: +#endif #define ZERO_FIXED_SECTION(sname, start, end) \ sname##_start = (start); \ @@ -397,4 +416,6 @@ name: EXC_COMMON_BEGIN(name); \ STD_EXCEPTION_COMMON(realvec + 0x2, name, hdlr); \ +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_POWERPC_HEAD_64_H */ diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index eba60416536e..c1dd1929342d 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -129,6 +129,10 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) } extern bool prep_irq_for_idle(void); +extern bool prep_irq_for_idle_irqsoff(void); +extern void irq_set_pending_from_srr1(unsigned long srr1); + +#define fini_irq_for_idle_irqsoff() trace_hardirqs_off(); extern void force_external_irq_replay(void); diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index f90b22c722e1..cd2fc1cc1cc7 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -226,6 +226,7 @@ struct machdep_calls { extern void e500_idle(void); extern void power4_idle(void); extern void power7_idle(void); +extern void power9_idle(void); extern void ppc6xx_idle(void); extern void book3e_idle(void); diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 81eff8631434..190d69a7f701 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -90,13 +90,14 @@ enum MCE_UserErrorType { enum MCE_RaErrorType { MCE_RA_ERROR_INDETERMINATE = 0, MCE_RA_ERROR_IFETCH = 1, - MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH = 2, - MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN = 3, - MCE_RA_ERROR_LOAD = 4, - MCE_RA_ERROR_STORE = 5, - MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 6, - MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN = 7, - MCE_RA_ERROR_LOAD_STORE_FOREIGN = 8, + MCE_RA_ERROR_IFETCH_FOREIGN = 2, + MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH = 3, + MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN = 4, + MCE_RA_ERROR_LOAD = 
5, + MCE_RA_ERROR_STORE = 6, + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 7, + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN = 8, + MCE_RA_ERROR_LOAD_STORE_FOREIGN = 9, }; enum MCE_LinkErrorType { diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 633139291a48..cc369a70f2bb 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -31,7 +31,8 @@ extern struct kmem_cache *pgtable_cache[]; static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 5134ade2e850..91314268f04f 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -340,6 +340,8 @@ static inline void __ptep_set_access_flags(struct mm_struct *mm, extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp); +int map_kernel_page(unsigned long va, phys_addr_t pa, int flags); + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 897d2e1c8a9b..9721c7867b9c 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -43,7 +43,8 @@ extern struct kmem_cache *pgtable_cache[]; static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -57,7 +58,8 @@ static inline void pgd_free(struct mm_struct 
*mm, pgd_t *pgd) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) @@ -96,7 +98,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, struct page *page; pte_t *pte; - pte = pte_alloc_one_kernel(mm, address); + pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT); if (!pte) return NULL; page = virt_to_page(pte); @@ -189,7 +191,8 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), + pgtable_gfp_flags(mm, GFP_KERNEL)); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index cb3e6242a78c..ef930ba500f9 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -667,12 +667,14 @@ enum { enum { OPAL_PHB_ERROR_DATA_TYPE_P7IOC = 1, - OPAL_PHB_ERROR_DATA_TYPE_PHB3 = 2 + OPAL_PHB_ERROR_DATA_TYPE_PHB3 = 2, + OPAL_PHB_ERROR_DATA_TYPE_PHB4 = 3 }; enum { OPAL_P7IOC_NUM_PEST_REGS = 128, - OPAL_PHB3_NUM_PEST_REGS = 256 + OPAL_PHB3_NUM_PEST_REGS = 256, + OPAL_PHB4_NUM_PEST_REGS = 512 }; struct OpalIoPhbErrorCommon { @@ -802,6 +804,75 @@ struct OpalIoPhb3ErrorData { __be64 pestB[OPAL_PHB3_NUM_PEST_REGS]; }; +struct OpalIoPhb4ErrorData { + struct OpalIoPhbErrorCommon common; + + __be32 brdgCtl; + + /* PHB4 cfg regs */ + __be32 deviceStatus; + __be32 slotStatus; + __be32 linkStatus; + __be32 devCmdStatus; + __be32 devSecStatus; + + /* cfg AER regs */ + __be32 rootErrorStatus; + __be32 uncorrErrorStatus; + __be32 corrErrorStatus; + __be32 tlpHdr1; + __be32 
tlpHdr2; + __be32 tlpHdr3; + __be32 tlpHdr4; + __be32 sourceId; + + /* PHB4 ETU Error Regs */ + __be64 nFir; /* 000 */ + __be64 nFirMask; /* 003 */ + __be64 nFirWOF; /* 008 */ + __be64 phbPlssr; /* 120 */ + __be64 phbCsr; /* 110 */ + __be64 lemFir; /* C00 */ + __be64 lemErrorMask; /* C18 */ + __be64 lemWOF; /* C40 */ + __be64 phbErrorStatus; /* C80 */ + __be64 phbFirstErrorStatus; /* C88 */ + __be64 phbErrorLog0; /* CC0 */ + __be64 phbErrorLog1; /* CC8 */ + __be64 phbTxeErrorStatus; /* D00 */ + __be64 phbTxeFirstErrorStatus; /* D08 */ + __be64 phbTxeErrorLog0; /* D40 */ + __be64 phbTxeErrorLog1; /* D48 */ + __be64 phbRxeArbErrorStatus; /* D80 */ + __be64 phbRxeArbFirstErrorStatus; /* D88 */ + __be64 phbRxeArbErrorLog0; /* DC0 */ + __be64 phbRxeArbErrorLog1; /* DC8 */ + __be64 phbRxeMrgErrorStatus; /* E00 */ + __be64 phbRxeMrgFirstErrorStatus; /* E08 */ + __be64 phbRxeMrgErrorLog0; /* E40 */ + __be64 phbRxeMrgErrorLog1; /* E48 */ + __be64 phbRxeTceErrorStatus; /* E80 */ + __be64 phbRxeTceFirstErrorStatus; /* E88 */ + __be64 phbRxeTceErrorLog0; /* EC0 */ + __be64 phbRxeTceErrorLog1; /* EC8 */ + + /* PHB4 REGB Error Regs */ + __be64 phbPblErrorStatus; /* 1900 */ + __be64 phbPblFirstErrorStatus; /* 1908 */ + __be64 phbPblErrorLog0; /* 1940 */ + __be64 phbPblErrorLog1; /* 1948 */ + __be64 phbPcieDlpErrorLog1; /* 1AA0 */ + __be64 phbPcieDlpErrorLog2; /* 1AA8 */ + __be64 phbPcieDlpErrorStatus; /* 1AB0 */ + __be64 phbRegbErrorStatus; /* 1C00 */ + __be64 phbRegbFirstErrorStatus; /* 1C08 */ + __be64 phbRegbErrorLog0; /* 1C40 */ + __be64 phbRegbErrorLog1; /* 1C48 */ + + __be64 pestA[OPAL_PHB4_NUM_PEST_REGS]; + __be64 pestB[OPAL_PHB4_NUM_PEST_REGS]; +}; + enum { OPAL_REINIT_CPUS_HILE_BE = (1 << 0), OPAL_REINIT_CPUS_HILE_LE = (1 << 1), @@ -877,6 +948,7 @@ enum { OPAL_PHB_CAPI_MODE_SNOOP_OFF = 2, OPAL_PHB_CAPI_MODE_SNOOP_ON = 3, OPAL_PHB_CAPI_MODE_DMA = 4, + OPAL_PHB_CAPI_MODE_DMA_TVT1 = 5, }; /* OPAL I2C request */ diff --git a/arch/powerpc/include/asm/paca.h 
b/arch/powerpc/include/asm/paca.h index 1c09f8fe2ee8..dc88a31cc79a 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -21,7 +21,11 @@ #include <asm/lppaca.h> #include <asm/mmu.h> #include <asm/page.h> +#ifdef CONFIG_PPC_BOOK3E #include <asm/exception-64e.h> +#else +#include <asm/exception-64s.h> +#endif #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include <asm/kvm_book3s_asm.h> #endif @@ -98,8 +102,8 @@ struct paca_struct { * Now, starting in cacheline 2, the exception save areas */ /* used for most interrupts/exceptions */ - u64 exgen[13] __attribute__((aligned(0x80))); - u64 exslb[13]; /* used for SLB/segment table misses + u64 exgen[EX_SIZE] __attribute__((aligned(0x80))); + u64 exslb[EX_SIZE]; /* used for SLB/segment table misses * on the linear mapping */ /* SLB related definitions */ u16 vmalloc_sllp; @@ -177,12 +181,14 @@ struct paca_struct { * to the sibling threads' paca. */ struct paca_struct **thread_sibling_pacas; + /* The PSSCR value that the kernel requested before going to stop */ + u64 requested_psscr; #endif #ifdef CONFIG_PPC_STD_MMU_64 /* Non-maskable exceptions that are not performance critical */ - u64 exnmi[13]; /* used for system reset (nmi) */ - u64 exmc[13]; /* used for machine checks */ + u64 exnmi[EX_SIZE]; /* used for system reset (nmi) */ + u64 exmc[EX_SIZE]; /* used for machine checks */ #endif #ifdef CONFIG_PPC_BOOK3S_64 /* Exclusive stacks for system reset and machine check exception. 
*/ diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 0413457ba11d..d795c5d5789c 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -3,6 +3,20 @@ #include <linux/mm.h> +#ifndef MODULE +static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) +{ + if (unlikely(mm == &init_mm)) + return gfp; + return gfp | __GFP_ACCOUNT; +} +#else /* !MODULE */ +static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) +{ + return gfp | __GFP_ACCOUNT; +} +#endif /* MODULE */ + #ifdef CONFIG_PPC_BOOK3S #include <asm/book3s/pgalloc.h> #else diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 1a9b45198c06..fa9ebaead91e 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -191,8 +191,7 @@ /* sorted alphabetically */ #define PPC_INST_BHRBE 0x7c00025c #define PPC_INST_CLRBHRB 0x7c00035c -#define PPC_INST_COPY 0x7c00060c -#define PPC_INST_COPY_FIRST 0x7c20060c +#define PPC_INST_COPY 0x7c20060c #define PPC_INST_CP_ABORT 0x7c00068c #define PPC_INST_DCBA 0x7c0005ec #define PPC_INST_DCBA_MASK 0xfc0007fe @@ -223,10 +222,10 @@ #define PPC_INST_MSGCLR 0x7c0001dc #define PPC_INST_MSGSYNC 0x7c0006ec #define PPC_INST_MSGSNDP 0x7c00011c +#define PPC_INST_MSGCLRP 0x7c00015c #define PPC_INST_MTTMR 0x7c0003dc #define PPC_INST_NOP 0x60000000 -#define PPC_INST_PASTE 0x7c00070c -#define PPC_INST_PASTE_LAST 0x7c20070d +#define PPC_INST_PASTE 0x7c20070d #define PPC_INST_POPCNTB 0x7c0000f4 #define PPC_INST_POPCNTB_MASK 0xfc0007fe #define PPC_INST_POPCNTD 0x7c0003f4 @@ -394,6 +393,8 @@ /* Deal with instructions that older assemblers aren't aware of */ #define PPC_CP_ABORT stringify_in_c(.long PPC_INST_CP_ABORT) +#define PPC_COPY(a, b) stringify_in_c(.long PPC_INST_COPY | \ + ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ __PPC_RA(a) | __PPC_RB(b)) #define PPC_DCBZL(a, b) 
stringify_in_c(.long PPC_INST_DCBZL | \ @@ -411,6 +412,8 @@ ___PPC_RB(b)) #define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \ ___PPC_RB(b)) +#define PPC_MSGCLRP(b) stringify_in_c(.long PPC_INST_MSGCLRP | \ + ___PPC_RB(b)) #define PPC_POPCNTB(a, s) stringify_in_c(.long PPC_INST_POPCNTB | \ __PPC_RA(a) | __PPC_RS(s)) #define PPC_POPCNTD(a, s) stringify_in_c(.long PPC_INST_POPCNTD | \ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 359c44341761..6baeeb9acd0d 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -770,15 +770,18 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) #else #define FIXUP_ENDIAN \ tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \ - b $+36; /* Skip trampoline if endian is good */ \ - .long 0x05009f42; /* bcl 20,31,$+4 */ \ - .long 0xa602487d; /* mflr r10 */ \ - .long 0x1c004a39; /* addi r10,r10,28 */ \ + b $+44; /* Skip trampoline if endian is good */ \ .long 0xa600607d; /* mfmsr r11 */ \ .long 0x01006b69; /* xori r11,r11,1 */ \ + .long 0x00004039; /* li r10,0 */ \ + .long 0x6401417d; /* mtmsrd r10,1 */ \ + .long 0x05009f42; /* bcl 20,31,$+4 */ \ + .long 0xa602487d; /* mflr r10 */ \ + .long 0x14004a39; /* addi r10,r10,20 */ \ .long 0xa6035a7d; /* mtsrr0 r10 */ \ .long 0xa6037b7d; /* mtsrr1 r11 */ \ .long 0x2400004c /* rfid */ + #endif /* !CONFIG_PPC_BOOK3E */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 1189d04f3bd1..fab7ff877304 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -421,6 +421,26 @@ static inline unsigned long __pack_fe01(unsigned int fpmode) #ifdef CONFIG_PPC64 #define cpu_relax() do { HMT_low(); HMT_medium(); barrier(); } while (0) + +#define spin_begin() HMT_low() + +#define spin_cpu_relax() barrier() + +#define spin_cpu_yield() spin_cpu_relax() + +#define spin_end() HMT_medium() + +#define spin_until_cond(cond) \ +do { \ + if 
(unlikely(!(cond))) { \ + spin_begin(); \ + do { \ + spin_cpu_relax(); \ + } while (!(cond)); \ + spin_end(); \ + } \ +} while (0) + #else #define cpu_relax() barrier() #endif @@ -474,11 +494,11 @@ extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ -extern unsigned long power7_nap(int check_irq); -extern unsigned long power7_sleep(void); -extern unsigned long power7_winkle(void); -extern unsigned long power9_idle_stop(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask); +extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc*/ +extern void power7_idle_type(unsigned long type); +extern unsigned long power9_idle_stop(unsigned long psscr_val); +extern void power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask); extern void flush_instruction_cache(void); extern void hard_reset_now(void); diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index c05cef6ee06c..18f168aebae3 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -168,6 +168,39 @@ TRACE_EVENT(hash_fault, __entry->addr, __entry->access, __entry->trap) ); + +TRACE_EVENT(tlbie, + + TP_PROTO(unsigned long lpid, unsigned long local, unsigned long rb, + unsigned long rs, unsigned long ric, unsigned long prs, + unsigned long r), + TP_ARGS(lpid, local, rb, rs, ric, prs, r), + TP_STRUCT__entry( + __field(unsigned long, lpid) + __field(unsigned long, local) + __field(unsigned long, rb) + __field(unsigned long, rs) + __field(unsigned long, ric) + __field(unsigned long, prs) + __field(unsigned long, r) + ), + + TP_fast_assign( + __entry->lpid = lpid; + __entry->local = local; + __entry->rb = rb; + __entry->rs = rs; + __entry->ric = ric; + __entry->prs = prs; + __entry->r = r; + ), + + TP_printk("lpid=%ld, local=%ld, rb=0x%lx, rs=0x%lx, ric=0x%lx, " + "prs=0x%lx, r=0x%lx", 
__entry->lpid, __entry->local, + __entry->rb, __entry->rs, __entry->ric, __entry->prs, + __entry->r) +); + #endif /* _TRACE_POWERPC_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index b15bf6bc0e94..0d960ef78a9a 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -1,2 +1,8 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += param.h +generic-y += poll.h +generic-y += resource.h +generic-y += sockios.h +generic-y += statfs.h diff --git a/arch/powerpc/include/uapi/asm/param.h b/arch/powerpc/include/uapi/asm/param.h deleted file mode 100644 index 965d45427975..000000000000 --- a/arch/powerpc/include/uapi/asm/param.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/param.h> diff --git a/arch/powerpc/include/uapi/asm/poll.h b/arch/powerpc/include/uapi/asm/poll.h deleted file mode 100644 index c98509d3149e..000000000000 --- a/arch/powerpc/include/uapi/asm/poll.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/poll.h> diff --git a/arch/powerpc/include/uapi/asm/resource.h b/arch/powerpc/include/uapi/asm/resource.h deleted file mode 100644 index 04bc4db8921b..000000000000 --- a/arch/powerpc/include/uapi/asm/resource.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/resource.h> diff --git a/arch/powerpc/include/uapi/asm/sockios.h b/arch/powerpc/include/uapi/asm/sockios.h deleted file mode 100644 index 55cef7675a31..000000000000 --- a/arch/powerpc/include/uapi/asm/sockios.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _ASM_POWERPC_SOCKIOS_H -#define _ASM_POWERPC_SOCKIOS_H - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -/* Socket-level I/O control calls. 
*/ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* _ASM_POWERPC_SOCKIOS_H */ diff --git a/arch/powerpc/include/uapi/asm/statfs.h b/arch/powerpc/include/uapi/asm/statfs.h deleted file mode 100644 index 5244834583a4..000000000000 --- a/arch/powerpc/include/uapi/asm/statfs.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_POWERPC_STATFS_H -#define _ASM_POWERPC_STATFS_H - -#include <asm-generic/statfs.h> - -#endif diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index e132902e1f14..0845eebc5af3 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -# timers used by tracing -CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) endif obj-y := cputable.o ptrace.o syscalls.o \ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index ae8e89e0d083..6e95c2c19a7e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -100,12 +100,12 @@ int main(void) OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]); #endif OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode); - OFFSET(THREAD_FPSTATE, thread_struct, fp_state); + OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr); OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area); OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr); OFFSET(THREAD_LOAD_FP, thread_struct, load_fp); #ifdef CONFIG_ALTIVEC - OFFSET(THREAD_VRSTATE, thread_struct, vr_state); + OFFSET(THREAD_VRSTATE, thread_struct, vr_state.vr); OFFSET(THREAD_VRSAVEAREA, thread_struct, vr_save_area); 
OFFSET(THREAD_VRSAVE, thread_struct, vrsave); OFFSET(THREAD_USED_VR, thread_struct, used_vr); @@ -145,9 +145,9 @@ int main(void) OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr); OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr); OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs); - OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state); + OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr); OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave); - OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state); + OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr); /* Local pt_regs on stack for Transactional Memory funcs. */ DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); @@ -745,9 +745,11 @@ int main(void) OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); + OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); + DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE); #ifdef CONFIG_PPC_8xx DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE)); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index bfbad08a1207..49d8422767b4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -57,7 +57,7 @@ system_call_common: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */ - bne tabort_syscall + bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif andi. r10,r12,MSR_PR @@ -143,6 +143,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) mtmsrd r11,1 #endif /* CONFIG_PPC_BOOK3E */ +system_call: /* label this so stack traces look sane */ /* We do need to set SOFTE in the stack frame or the return * from interrupt will be painful */ @@ -152,11 +153,11 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) CURRENT_THREAD_INFO(r11, r1) ld r10,TI_FLAGS(r11) andi. 
r11,r10,_TIF_SYSCALL_DOTRACE - bne syscall_dotrace /* does not return */ + bne .Lsyscall_dotrace /* does not return */ cmpldi 0,r0,NR_syscalls - bge- syscall_enosys + bge- .Lsyscall_enosys -system_call: /* label this so stack traces look sane */ +.Lsyscall: /* * Need to vector to 32 Bit or default sys_call_table here, * based on caller's run-mode / personality. @@ -185,8 +186,20 @@ system_call: /* label this so stack traces look sane */ #ifdef CONFIG_PPC_BOOK3S /* No MSR:RI on BookE */ andi. r10,r8,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore #endif + +/* + * This is a few instructions into the actual syscall exit path (which actually + * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the + * number of visible symbols for profiling purposes. + * + * We can probe from system_call until this point as MSR_RI is set. But once it + * is cleared below, we won't be able to take a trap. + * + * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL(). + */ +system_call_exit: /* * Disable interrupts so current_thread_info()->flags can't change, * and so that we don't get interrupted after loading SRR0/1. @@ -208,31 +221,21 @@ system_call: /* label this so stack traces look sane */ ld r9,TI_FLAGS(r12) li r11,-MAX_ERRNO andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) - bne- syscall_exit_work + bne- .Lsyscall_exit_work - andi. r0,r8,MSR_FP - beq 2f + /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */ + li r7,MSR_FP #ifdef CONFIG_ALTIVEC - andis. 
r0,r8,MSR_VEC@h - bne 3f -#endif -2: addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif - bl restore_math -#ifdef CONFIG_PPC_BOOK3S - li r11,0 - mtmsrd r11,1 + oris r7,r7,MSR_VEC@h #endif - ld r8,_MSR(r1) - ld r3,RESULT(r1) - li r11,-MAX_ERRNO + and r0,r8,r7 + cmpd r0,r7 + bne .Lsyscall_restore_math +.Lsyscall_restore_math_cont: -3: cmpld r3,r11 + cmpld r3,r11 ld r5,_CCR(r1) - bge- syscall_error + bge- .Lsyscall_error .Lsyscall_error_cont: ld r7,_NIP(r1) BEGIN_FTR_SECTION @@ -258,14 +261,48 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) RFI b . /* prevent speculative execution */ -syscall_error: +.Lsyscall_error: oris r5,r5,0x1000 /* Set SO bit in CR */ neg r3,r3 std r5,_CCR(r1) b .Lsyscall_error_cont - + +.Lsyscall_restore_math: + /* + * Some initial tests from restore_math to avoid the heavyweight + * C code entry and MSR manipulations. + */ + LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK) + and. r0,r0,r8 + bne 1f + + ld r7,PACACURRENT(r13) + lbz r0,THREAD+THREAD_LOAD_FP(r7) +#ifdef CONFIG_ALTIVEC + lbz r6,THREAD+THREAD_LOAD_VEC(r7) + add r0,r0,r6 +#endif + cmpdi r0,0 + beq .Lsyscall_restore_math_cont + +1: addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_PPC_BOOK3S + li r10,MSR_RI + mtmsrd r10,1 /* Restore RI */ +#endif + bl restore_math +#ifdef CONFIG_PPC_BOOK3S + li r11,0 + mtmsrd r11,1 +#endif + /* Restore volatiles, reload MSR from updated one */ + ld r8,_MSR(r1) + ld r3,RESULT(r1) + li r11,-MAX_ERRNO + b .Lsyscall_restore_math_cont + /* Traced system call support */ -syscall_dotrace: +.Lsyscall_dotrace: bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl do_syscall_trace_enter @@ -286,23 +323,23 @@ syscall_dotrace: ld r7,GPR7(r1) ld r8,GPR8(r1) - /* Repopulate r9 and r10 for the system_call path */ + /* Repopulate r9 and r10 for the syscall path */ addi r9,r1,STACK_FRAME_OVERHEAD CURRENT_THREAD_INFO(r10, r1) ld r10,TI_FLAGS(r10) cmpldi r0,NR_syscalls - blt+ system_call + blt+ .Lsyscall /* Return code is already in 
r3 thanks to do_syscall_trace_enter() */ b .Lsyscall_exit -syscall_enosys: +.Lsyscall_enosys: li r3,-ENOSYS b .Lsyscall_exit -syscall_exit_work: +.Lsyscall_exit_work: #ifdef CONFIG_PPC_BOOK3S li r10,MSR_RI mtmsrd r10,1 /* Restore RI */ @@ -362,7 +399,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b ret_from_except #ifdef CONFIG_PPC_TRANSACTIONAL_MEM -tabort_syscall: +.Ltabort_syscall: /* Firstly we need to enable TM in the kernel */ mfmsr r10 li r9, 1 @@ -388,6 +425,8 @@ tabort_syscall: rfid b . /* prevent speculative execution */ #endif +_ASM_NOKPROBE_SYMBOL(system_call_common); +_ASM_NOKPROBE_SYMBOL(system_call_exit); /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) @@ -398,6 +437,7 @@ _GLOBAL(save_nvgprs) clrrdi r0,r11,1 std r0,_TRAP(r1) blr +_ASM_NOKPROBE_SYMBOL(save_nvgprs); /* @@ -488,33 +528,30 @@ _GLOBAL(_switch) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ -#ifdef CONFIG_SMP - /* We need a sync somewhere here to make sure that if the - * previous task gets rescheduled on another CPU, it sees all - * stores it has performed on this one. + /* + * On SMP kernels, care must be taken because a task may be + * scheduled off CPUx and on to CPUy. Memory ordering must be + * considered. + * + * Cacheable stores on CPUx will be visible when the task is + * scheduled on CPUy by virtue of the core scheduler barriers + * (see "Notes on Program-Order guarantees on SMP systems." in + * kernel/sched/core.c). + * + * Uncacheable stores in the case of involuntary preemption must + * be taken care of. The smp_mb__before_spin_lock() in __schedule() + * is implemented as hwsync on powerpc, which orders MMIO too. So + * long as there is an hwsync in the context switch path, it will + * be executed on the source CPU after the task has performed + * all MMIO ops on that CPU, and on the destination CPU before the + * task performs any MMIO ops there. 
*/ - sync -#endif /* CONFIG_SMP */ /* - * If we optimise away the clear of the reservation in system - * calls because we know the CPU tracks the address of the - * reservation, then we need to clear it here to cover the - * case that the kernel context switch path has no larx - * instructions. + * The kernel context switch path must contain a spin_lock, + * which contains larx/stcx, which will clear any reservation + * of the task being switched. */ -BEGIN_FTR_SECTION - ldarx r6,0,r1 -END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) - -BEGIN_FTR_SECTION -/* - * A cp_abort (copy paste abort) here ensures that when context switching, a - * copy from one process can't leak into the paste of another. - */ - PPC_CP_ABORT -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - #ifdef CONFIG_PPC_BOOK3S /* Cancel all explict user streams as they will have no use after context * switch and will stop the HW from creating streams itself @@ -583,6 +620,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) top of the kernel stack. */ addi r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE + /* + * PMU interrupts in radix may come in here. They will use r1, not + * PACAKSAVE, so this stack switch will not cause a problem. They + * will store to the process stack, which may then be migrated to + * another CPU. However the rq lock release on this CPU paired with + * the rq lock acquire on the new CPU before the stack becomes + * active on the new CPU, will order those stores. + */ mr r1,r8 /* start using new stack pointer */ std r7,PACAKSAVE(r13) @@ -763,11 +808,11 @@ restore: ld r5,SOFTE(r1) lbz r6,PACASOFTIRQEN(r13) cmpwi cr0,r5,0 - beq restore_irq_off + beq .Lrestore_irq_off /* We are enabling, were we already enabled ? 
Yes, just return */ cmpwi cr0,r6,1 - beq cr0,do_restore + beq cr0,.Ldo_restore /* * We are about to soft-enable interrupts (we are hard disabled @@ -776,14 +821,14 @@ restore: */ lbz r0,PACAIRQHAPPENED(r13) cmpwi cr0,r0,0 - bne- restore_check_irq_replay + bne- .Lrestore_check_irq_replay /* * Get here when nothing happened while soft-disabled, just * soft-enable and move-on. We will hard-enable as a side * effect of rfi */ -restore_no_replay: +.Lrestore_no_replay: TRACE_ENABLE_INTS li r0,1 stb r0,PACASOFTIRQEN(r13); @@ -791,7 +836,7 @@ restore_no_replay: /* * Final return path. BookE is handled in a different file */ -do_restore: +.Ldo_restore: #ifdef CONFIG_PPC_BOOK3E b exception_return_book3e #else @@ -825,7 +870,7 @@ fast_exception_return: REST_8GPRS(5, r1) andi. r0,r3,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore /* Load PPR from thread struct before we clear MSR:RI */ BEGIN_FTR_SECTION @@ -883,7 +928,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * make sure that in this case, we also clear PACA_IRQ_HARD_DIS * or that bit can get out of sync and bad things will happen */ -restore_irq_off: +.Lrestore_irq_off: ld r3,_MSR(r1) lbz r7,PACAIRQHAPPENED(r13) andi. r0,r3,MSR_EE @@ -893,13 +938,13 @@ restore_irq_off: 1: li r0,0 stb r0,PACASOFTIRQEN(r13); TRACE_DISABLE_INTS - b do_restore + b .Ldo_restore /* * Something did happen, check if a re-emit is needed * (this also clears paca->irq_happened) */ -restore_check_irq_replay: +.Lrestore_check_irq_replay: /* XXX: We could implement a fast path here where we check * for irq_happened being just 0x01, in which case we can * clear it and return. That means that we would potentially @@ -909,7 +954,7 @@ restore_check_irq_replay: */ bl __check_irq_replay cmpwi cr0,r3,0 - beq restore_no_replay + beq .Lrestore_no_replay /* * We need to re-emit an interrupt. We do so by re-using our @@ -958,10 +1003,18 @@ restore_check_irq_replay: #endif /* CONFIG_PPC_DOORBELL */ 1: b ret_from_except /* What else to do here ? 
*/ -unrecov_restore: +.Lunrecov_restore: addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception - b unrecov_restore + b .Lunrecov_restore + +_ASM_NOKPROBE_SYMBOL(ret_from_except); +_ASM_NOKPROBE_SYMBOL(ret_from_except_lite); +_ASM_NOKPROBE_SYMBOL(resume_kernel); +_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq); +_ASM_NOKPROBE_SYMBOL(restore); +_ASM_NOKPROBE_SYMBOL(fast_exception_return); + #ifdef CONFIG_PPC_RTAS /* @@ -1038,6 +1091,8 @@ _GLOBAL(enter_rtas) rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG) ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE andc r6,r0,r9 + +__enter_rtas: sync /* disable interrupts so SRR0/1 */ mtmsrd r0 /* don't get trashed */ @@ -1074,6 +1129,8 @@ rtas_return_loc: mtspr SPRN_SRR1,r4 rfid b . /* prevent speculative execution */ +_ASM_NOKPROBE_SYMBOL(__enter_rtas) +_ASM_NOKPROBE_SYMBOL(rtas_return_loc) .align 3 1: .llong rtas_restore_regs diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b886795060fd..4c18a5fbb4bb 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -99,7 +99,11 @@ EXC_VIRT_NONE(0x4000, 0x100) #ifdef CONFIG_PPC_P7_NAP /* * If running native on arch 2.06 or later, check if we are waking up - * from nap/sleep/winkle, and branch to idle handler. + * from nap/sleep/winkle, and branch to idle handler. This tests SRR1 + * bits 46:47. A non-0 value indicates that we are coming from a power + * saving state. The idle wakeup handler initially runs in real mode, + * but we branch to the 0xc000... address so we can turn on relocation + * with mtmsr. */ #define IDLETEST(n) \ BEGIN_FTR_SECTION ; \ @@ -107,7 +111,7 @@ EXC_VIRT_NONE(0x4000, 0x100) rlwinm. 
r10,r10,47-31,30,31 ; \ beq- 1f ; \ cmpwi cr3,r10,2 ; \ - BRANCH_TO_COMMON(r10, system_reset_idle_common) ; \ + BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else @@ -128,6 +132,7 @@ EXC_VIRT_NONE(0x4100, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) + mfspr r12,SPRN_SRR1 b pnv_powersave_wakeup #endif @@ -507,46 +512,22 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. - */ - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. 
- */ - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -575,88 +556,82 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) -/* This handler is used by both 0x380 and 0x480 slb miss interrupts */ -EXC_COMMON_BEGIN(slb_miss_realmode) +/* + * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as + * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled. + */ +EXC_COMMON_BEGIN(slb_miss_common) /* * r13 points to the PACA, r9 contains the saved CR, - * r12 contain the saved SRR1, SRR0 is still ready for return + * r12 contains the saved r3, + * r11 contain the saved SRR1, SRR0 is still ready for return * r3 has the faulting address * r9 - r13 are saved in paca->exslb. 
- * r3 is saved in paca->slb_r3 * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss * We assume we aren't going to take any exceptions during this * procedure. */ mflr r10 -#ifdef CONFIG_RELOCATABLE - mtctr r11 -#endif - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - std r3,PACA_EXSLB+EX_DAR(r13) + + /* + * Test MSR_RI before calling slb_allocate_realmode, because the + * MSR in r11 gets clobbered. However we still want to allocate + * SLB in case MSR_RI=0, to minimise the risk of getting stuck in + * recursive SLB faults. So use cr5 for this, which is preserved. + */ + andi. r11,r11,MSR_RI /* check for unrecoverable exception */ + cmpdi cr5,r11,MSR_RI crset 4*cr0+eq #ifdef CONFIG_PPC_STD_MMU_64 BEGIN_MMU_FTR_SECTION - bl slb_allocate_realmode + bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) #endif ld r10,PACA_EXSLB+EX_LR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ mtlr r10 - beq 8f /* if bad address, make full stack frame */ + beq- 8f /* if bad address, make full stack frame */ - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- 2f + bne- cr5,2f /* if unrecoverable exception, oops */ /* All done -- return from exception. */ .machine push .machine "power4" mtcrf 0x80,r9 + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ mtcrf 0x02,r9 /* I/D indication is in cr6 */ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ .machine pop + RESTORE_CTR(r9, PACA_EXSLB) RESTORE_PPR_PACA(PACA_EXSLB, r9) + mr r3,r12 ld r9,PACA_EXSLB+EX_R9(r13) ld r10,PACA_EXSLB+EX_R10(r13) ld r11,PACA_EXSLB+EX_R11(r13) @@ -665,7 +640,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . 
/* prevent speculative execution */ -2: mfspr r11,SPRN_SRR0 +2: std r3,PACA_EXSLB+EX_DAR(r13) + mr r3,r12 + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,unrecov_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -673,7 +651,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . -8: mfspr r11,SPRN_SRR0 +8: std r3,PACA_EXSLB+EX_DAR(r13) + mr r3,r12 + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,bad_addr_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -821,46 +802,80 @@ EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) TRAMP_KVM(PACA_EXGEN, 0xb00) EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +/* + * system call / hypercall (0xc00, 0x4c00) + * + * The system call exception is invoked with "sc 0" and does not alter HV bit. + * There is support for kernel code to invoke system calls but there are no + * in-tree users. + * + * The hypercall is invoked with "sc 1" and sets HV=1. + * + * In HPT, sc 1 always goes to 0xc00 real mode. In RADIX, sc 1 can go to + * 0x4c00 virtual mode. + * + * Call convention: + * + * syscall register convention is in Documentation/powerpc/syscall64-abi.txt + * + * For hypercalls, the register convention is as follows: + * r0 volatile + * r1-2 nonvolatile + * r3 volatile parameter and return value for status + * r4-r10 volatile input and output value + * r11 volatile hypercall number and output value + * r12 volatile + * r13-r31 nonvolatile + * LR nonvolatile + * CTR volatile + * XER volatile + * CR0-1 CR5-7 volatile + * CR2-4 nonvolatile + * Other registers nonvolatile + * + * The intersection of volatile registers that don't contain possible + * inputs is: r12, cr0, xer, ctr. We may use these as scratch regs + * upon entry without saving. + */ #ifdef CONFIG_KVM_BOOK3S_64_HANDLER - /* - * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems - * that support it) before changing to HMT_MEDIUM. That allows the KVM - * code to save that value into the guest state (it is the guest's PPR - * value). 
Otherwise just change to HMT_MEDIUM as userspace has - * already saved the PPR. - */ + /* + * There is a little bit of juggling to get syscall and hcall + * working well. Save r10 in ctr to be restored in case it is a + * hcall. + * + * Userspace syscalls have already saved the PPR, hcalls must save + * it before setting HMT_MEDIUM. + */ #define SYSCALL_KVMTEST \ - SET_SCRATCH0(r13); \ + mr r12,r13; \ GET_PACA(r13); \ - std r9,PACA_EXGEN+EX_R9(r13); \ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ + mtctr r10; \ + KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \ HMT_MEDIUM; \ - std r10,PACA_EXGEN+EX_R10(r13); \ - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \ - mfcr r9; \ - KVMTEST_PR(0xc00); \ - GET_SCRATCH0(r13) + mr r9,r12; \ #else #define SYSCALL_KVMTEST \ - HMT_MEDIUM + HMT_MEDIUM; \ + mr r9,r13; \ + GET_PACA(r13); #endif #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -/* Syscall routine is used twice, in reloc-off and reloc-on paths */ -#define SYSCALL_PSERIES_1 \ +#define SYSCALL_FASTENDIAN_TEST \ BEGIN_FTR_SECTION \ cmpdi r0,0x1ebe ; \ beq- 1f ; \ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - mr r9,r13 ; \ - GET_PACA(r13) ; \ - mfspr r11,SPRN_SRR0 ; \ -0: -#define SYSCALL_PSERIES_2_RFID \ +/* + * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, + * and HMT_MEDIUM. + */ +#define SYSCALL_REAL \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ LOAD_SYSCALL_HANDLER(r10) ; \ mtspr SPRN_SRR0,r10 ; \ @@ -869,11 +884,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . ; /* prevent speculative execution */ -#define SYSCALL_PSERIES_3 \ +#define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ + mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . 
; /* prevent speculative execution */ @@ -882,16 +898,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ * We can't branch directly so we do it via the CTR which * is volatile across system calls. */ -#define SYSCALL_PSERIES_2_DIRECT \ - LOAD_SYSCALL_HANDLER(r12) ; \ - mtctr r12 ; \ +#define SYSCALL_VIRT \ + LOAD_SYSCALL_HANDLER(r10) ; \ + mtctr r10 ; \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; \ bctr ; #else /* We can branch directly */ -#define SYSCALL_PSERIES_2_DIRECT \ +#define SYSCALL_VIRT \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; /* Set RI (EE=0) */ \ @@ -899,20 +917,43 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ #endif EXC_REAL_BEGIN(system_call, 0xc00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_RFID - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_REAL + SYSCALL_FASTENDIAN EXC_REAL_END(system_call, 0xc00, 0x100) EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_DIRECT - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_VIRT + SYSCALL_FASTENDIAN EXC_VIRT_END(system_call, 0x4c00, 0x100) -TRAMP_KVM(PACA_EXGEN, 0xc00) +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + /* + * This is a hcall, so register convention is as above, with these + * differences: + * r13 = PACA + * r12 = orig r13 + * ctr = orig r10 + */ +TRAMP_KVM_BEGIN(do_kvm_0xc00) + /* + * Save the PPR (on systems that support it) before changing to + * HMT_MEDIUM. That allows the KVM code to save that value into the + * guest state (it is the guest's PPR value). 
+ */ + OPT_GET_SPR(r0, SPRN_PPR, CPU_FTR_HAS_PPR) + HMT_MEDIUM + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r0, CPU_FTR_HAS_PPR) + mfctr r10 + SET_SCRATCH0(r12) + std r9,PACA_EXGEN+EX_R9(r13) + mfcr r9 + std r10,PACA_EXGEN+EX_R10(r13) + KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00) +#endif EXC_REAL(single_step, 0xd00, 0x100) @@ -1553,6 +1594,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) 1: addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_bad_stack b 1b +_ASM_NOKPROBE_SYMBOL(bad_stack); + +/* + * When doorbell is triggered from system reset wakeup, the message is + * not cleared, so it would fire again when EE is enabled. + * + * When coming from local_irq_enable, there may be the same problem if + * we were hard disabled. + * + * Execute msgclr to clear pending exceptions before handling it. + */ +h_doorbell_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLR(3) + b h_doorbell_common + +doorbell_super_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLRP(3) + b doorbell_super_common /* * Called from arch_local_irq_enable when an interrupt needs @@ -1563,6 +1624,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) * Note: While MSR:EE is off, we need to make sure that _MSR * in the generated frame has EE set to 1 or the exception * handler will not properly re-enable them. + * + * Note that we don't specify LR as the NIP (return address) for + * the interrupt because that would unbalance the return branch + * predictor. */ _GLOBAL(__replay_interrupt) /* We are going to jump to the exception common code which @@ -1570,7 +1635,7 @@ _GLOBAL(__replay_interrupt) * we don't give a damn about, so we don't bother storing them. 
*/ mfmsr r12 - mflr r11 + LOAD_REG_ADDR(r11, 1f) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 @@ -1579,13 +1644,16 @@ _GLOBAL(__replay_interrupt) beq hardware_interrupt_common BEGIN_FTR_SECTION cmpwi r3,0xe80 - beq h_doorbell_common + beq h_doorbell_common_msgclr cmpwi r3,0xea0 beq h_virt_irq_common cmpwi r3,0xe60 beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 - beq doorbell_super_common + beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) +1: blr + +_ASM_NOKPROBE_SYMBOL(__replay_interrupt) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 466569e26278..3079518f2245 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -113,11 +113,55 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, return 1; } +/* + * If fadump is registered, check if the memory provided + * falls within boot memory area. + */ +int is_fadump_boot_memory_area(u64 addr, ulong size) +{ + if (!fw_dump.dump_registered) + return 0; + + return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; +} + int is_fadump_active(void) { return fw_dump.dump_active; } +/* + * Returns 1, if there are no holes in boot memory area, + * 0 otherwise. + */ +static int is_boot_memory_area_contiguous(void) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(RMA_START); + unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size); + unsigned int ret = 0; + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + /* Memory hole from start_pfn to tstart */ + if (tstart > start_pfn) + break; + + if (tend == end_pfn) { + ret = 1; + break; + } + + start_pfn = tend + 1; + } + } + + return ret; +} + /* Print firmware assisted dump configurations for debugging purpose. 
*/ static void fadump_show_config(void) { @@ -212,20 +256,46 @@ static inline unsigned long fadump_calculate_reserve_size(void) int ret; unsigned long long base, size; + if (fw_dump.reserve_bootvar) + pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n"); + /* * Check if the size is specified through crashkernel= cmdline - * option. If yes, then use that but ignore base as fadump - * reserves memory at end of RAM. + * option. If yes, then use that but ignore base as fadump reserves + * memory at a predefined offset. */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &size, &base); if (ret == 0 && size > 0) { + unsigned long max_size; + + if (fw_dump.reserve_bootvar) + pr_info("Using 'crashkernel=' parameter for memory reservation.\n"); + fw_dump.reserve_bootvar = (unsigned long)size; + + /* + * Adjust if the boot memory size specified is above + * the upper limit. + */ + max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO; + if (fw_dump.reserve_bootvar > max_size) { + fw_dump.reserve_bootvar = max_size; + pr_info("Adjusted boot memory size to %luMB\n", + (fw_dump.reserve_bootvar >> 20)); + } + + return fw_dump.reserve_bootvar; + } else if (fw_dump.reserve_bootvar) { + /* + * 'fadump_reserve_mem=' is being used to reserve memory + * for firmware-assisted dump. + */ return fw_dump.reserve_bootvar; } /* divide by 20 to get 5% of value */ - size = memblock_end_of_DRAM() / 20; + size = memblock_phys_mem_size() / 20; /* round it down in multiples of 256 */ size = size & ~0x0FFFFFFFUL; @@ -377,9 +447,22 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -static void register_fw_dump(struct fadump_mem_struct *fdm) +/* + * Look for fadump_reserve_mem= cmdline option + * TODO: Remove references to 'fadump_reserve_mem=' parameter, + * the sooner 'crashkernel=' parameter is accustomed to. 
+ */ +static int __init early_fadump_reserve_mem(char *p) +{ + if (p) + fw_dump.reserve_bootvar = memparse(p, &p); + return 0; +} +early_param("fadump_reserve_mem", early_fadump_reserve_mem); + +static int register_fw_dump(struct fadump_mem_struct *fdm) { - int rc; + int rc, err; unsigned int wait_time; pr_debug("Registering for firmware-assisted kernel dump...\n"); @@ -396,26 +479,38 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) } while (wait_time); + err = -EIO; switch (rc) { + default: + pr_err("Failed to register. Unknown Error(%d).\n", rc); + break; case -1: printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Hardware Error(%d).\n", rc); break; case -3: + if (!is_boot_memory_area_contiguous()) + pr_err("Can't have holes in boot memory area while " + "registering fadump\n"); + printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Parameter Error(%d).\n", rc); + err = -EINVAL; break; case -9: printk(KERN_ERR "firmware-assisted kernel dump is already " " registered."); fw_dump.dump_registered = 1; + err = -EEXIST; break; case 0: printk(KERN_INFO "firmware-assisted kernel dump registration" " is successful\n"); fw_dump.dump_registered = 1; + err = 0; break; } + return err; } void crash_fadump(struct pt_regs *regs, const char *str) @@ -831,8 +926,19 @@ static void fadump_setup_crash_memory_ranges(void) for_each_memblock(memory, reg) { start = (unsigned long long)reg->base; end = start + (unsigned long long)reg->size; - if (start == RMA_START && end >= fw_dump.boot_memory_size) - start = fw_dump.boot_memory_size; + + /* + * skip the first memory chunk that is already added (RMA_START + * through boot_memory_size). This logic needs a relook if and + * when RMA_START changes to a non-zero value. 
+ */ + BUILD_BUG_ON(RMA_START != 0); + if (start < fw_dump.boot_memory_size) { + if (end > fw_dump.boot_memory_size) + start = fw_dump.boot_memory_size; + else + continue; + } /* add this range excluding the reserved dump area. */ fadump_exclude_reserved_area(start, end); @@ -956,7 +1062,7 @@ static unsigned long init_fadump_header(unsigned long addr) return addr; } -static void register_fadump(void) +static int register_fadump(void) { unsigned long addr; void *vaddr; @@ -966,7 +1072,7 @@ static void register_fadump(void) * assisted dump. */ if (!fw_dump.reserve_dump_area_size) - return; + return -ENODEV; fadump_setup_crash_memory_ranges(); @@ -979,7 +1085,7 @@ static void register_fadump(void) fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. */ - register_fw_dump(&fdm); + return register_fw_dump(&fdm); } static int fadump_unregister_dump(struct fadump_mem_struct *fdm) @@ -1046,28 +1152,71 @@ void fadump_cleanup(void) } } +static void fadump_free_reserved_memory(unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long time_limit = jiffies + HZ; + + pr_info("freeing reserved memory (0x%llx - 0x%llx)\n", + PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + free_reserved_page(pfn_to_page(pfn)); + + if (time_after(jiffies, time_limit)) { + cond_resched(); + time_limit = jiffies + HZ; + } + } +} + +/* + * Skip memory holes and free memory that was actually reserved. 
+ */ +static void fadump_release_reserved_area(unsigned long start, unsigned long end) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(start); + unsigned long end_pfn = PHYS_PFN(end); + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + fadump_free_reserved_memory(tstart, tend); + + if (tend == end_pfn) + break; + + start_pfn = tend + 1; + } + } +} + /* * Release the memory that was reserved in early boot to preserve the memory * contents. The released memory will be available for general use. */ static void fadump_release_memory(unsigned long begin, unsigned long end) { - unsigned long addr; unsigned long ra_start, ra_end; ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + fw_dump.reserve_dump_area_size; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - /* - * exclude the dump reserve area. Will reuse it for next - * fadump registration. - */ - if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start)) - continue; - - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); - } + /* + * exclude the dump reserve area. Will reuse it for next + * fadump registration. 
+ */ + if (begin < ra_end && end > ra_start) { + if (begin < ra_start) + fadump_release_reserved_area(begin, ra_start); + if (end > ra_end) + fadump_release_reserved_area(ra_end, end); + } else + fadump_release_reserved_area(begin, end); } static void fadump_invalidate_release_mem(void) @@ -1161,7 +1310,6 @@ static ssize_t fadump_register_store(struct kobject *kobj, switch (buf[0]) { case '0': if (fw_dump.dump_registered == 0) { - ret = -EINVAL; goto unlock_out; } /* Un-register Firmware-assisted dump */ @@ -1169,11 +1317,11 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case '1': if (fw_dump.dump_registered == 1) { - ret = -EINVAL; + ret = -EEXIST; goto unlock_out; } /* Register Firmware-assisted dump */ - register_fadump(); + ret = register_fadump(); break; default: ret = -EINVAL; diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 4898d676dcae..5adb390e773b 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -31,6 +31,7 @@ * registers for winkle support. */ #define _SDR1 GPR3 +#define _PTCR GPR3 #define _RPR GPR4 #define _SPURR GPR5 #define _PURR GPR6 @@ -39,7 +40,7 @@ #define _AMOR GPR9 #define _WORT GPR10 #define _WORC GPR11 -#define _PTCR GPR12 +#define _LPCR GPR12 #define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16 @@ -55,12 +56,14 @@ save_sprs_to_stack: * here since any thread in the core might wake up first */ BEGIN_FTR_SECTION - mfspr r3,SPRN_PTCR - std r3,_PTCR(r1) /* * Note - SDR1 is dropped in Power ISA v3. 
Hence not restoring * SDR1 here */ + mfspr r3,SPRN_PTCR + std r3,_PTCR(r1) + mfspr r3,SPRN_LPCR + std r3,_LPCR(r1) FTR_SECTION_ELSE mfspr r3,SPRN_SDR1 std r3,_SDR1(r1) @@ -106,13 +109,9 @@ core_idle_lock_held: /* * Pass requested state in r3: * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 - * - Requested STOP state in POWER9 - * - * To check IRQ_HAPPENED in r4 - * 0 - don't check - * 1 - check + * - Requested PSSCR value in POWER9 * - * Address to 'rfid' to in r5 + * Address of idle handler to branch to in realmode in r4 */ pnv_powersave_common: /* Use r3 to pass state nap/sleep/winkle */ @@ -122,37 +121,14 @@ pnv_powersave_common: * need to save PC, some CR bits and the NV GPRs, * but for now an interrupt frame will do. */ + mtctr r4 + mflr r0 std r0,16(r1) stdu r1,-INT_FRAME_SIZE(r1) std r0,_LINK(r1) std r0,_NIP(r1) - /* Hard disable interrupts */ - mfmsr r9 - rldicl r9,r9,48,1 - rotldi r9,r9,16 - mtmsrd r9,1 /* hard-disable interrupts */ - - /* Check if something happened while soft-disabled */ - lbz r0,PACAIRQHAPPENED(r13) - andi. r0,r0,~PACA_IRQ_HARD_DIS@l - beq 1f - cmpwi cr0,r4,0 - beq 1f - addi r1,r1,INT_FRAME_SIZE - ld r0,16(r1) - li r3,0 /* Return 0 (no nap) */ - mtlr r0 - blr - -1: /* We mark irqs hard disabled as this is the state we'll - * be in when returning and we need to tell arch_local_irq_restore() - * about it - */ - li r0,PACA_IRQ_HARD_DIS - stb r0,PACAIRQHAPPENED(r13) - /* We haven't lost state ... yet */ li r0,0 stb r0,PACA_NAPSTATELOST(r13) @@ -160,9 +136,8 @@ pnv_powersave_common: /* Continue saving state */ SAVE_GPR(2, r1) SAVE_NVGPRS(r1) - mfcr r4 - std r4,_CCR(r1) - std r9,_MSR(r1) + mfcr r5 + std r5,_CCR(r1) std r1,PACAR1(r13) /* @@ -172,12 +147,8 @@ pnv_powersave_common: * the MMU context to the guest. 
*/ LOAD_REG_IMMEDIATE(r7, MSR_IDLE) - li r6, MSR_RI - andc r6, r9, r6 - mtmsrd r6, 1 /* clear RI before setting SRR0/1 */ - mtspr SPRN_SRR0, r5 - mtspr SPRN_SRR1, r7 - rfid + mtmsrd r7,0 + bctr .globl pnv_enter_arch207_idle_mode pnv_enter_arch207_idle_mode: @@ -285,6 +256,19 @@ power_enter_stop: bne .Lhandle_esl_ec_set IDLE_STATE_ENTER_SEQ(PPC_STOP) li r3,0 /* Since we didn't lose state, return 0 */ + + /* + * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so + * it can determine if the wakeup reason is an HMI in + * CHECK_HMI_INTERRUPT. + * + * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup + * reason, so there is no point setting r12 to SRR1. + * + * Further, we clear r12 here, so that we don't accidentally enter the + * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI. + */ + li r12, 0 b pnv_wakeup_noloss .Lhandle_esl_ec_set: @@ -319,45 +303,23 @@ lwarx_loop_stop: IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) -_GLOBAL(power7_idle) +/* + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE). 
+ */ +_GLOBAL(power7_idle_insn) /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDRBASE(r3,powersave_nap) - lwz r4,ADDROFF(powersave_nap)(r3) - cmpwi 0,r4,0 - beqlr - li r3, 1 - /* fall through */ - -_GLOBAL(power7_nap) - mr r4,r3 - li r3,PNV_THREAD_NAP - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_sleep) - li r3,PNV_THREAD_SLEEP - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_winkle) - li r3,PNV_THREAD_WINKLE - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) + LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode) b pnv_powersave_common - /* No return */ #define CHECK_HMI_INTERRUPT \ - mfspr r0,SPRN_SRR1; \ BEGIN_FTR_SECTION_NESTED(66); \ - rlwinm r0,r0,45-31,0xf; /* extract wake reason field (P8) */ \ + rlwinm r0,r12,45-31,0xf; /* extract wake reason field (P8) */ \ FTR_SECTION_ELSE_NESTED(66); \ - rlwinm r0,r0,45-31,0xe; /* P7 wake reason field is 3 bits */ \ + rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ cmpwi r0,0xa; /* Hypervisor maintenance ? */ \ - bne 20f; \ + bne+ 20f; \ /* Invoke opal call to handle hmi */ \ ld r2,PACATOC(r13); \ ld r1,PACAR1(r13); \ @@ -369,16 +331,13 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ 20: nop; /* - * r3 - The PSSCR value corresponding to the stop state. - * r4 - The PSSCR mask corrresonding to the stop state. + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired PSSCR register value. 
*/ _GLOBAL(power9_idle_stop) - mfspr r5,SPRN_PSSCR - andc r5,r5,r4 - or r3,r3,r5 + std r3, PACA_REQ_PSSCR(r13) mtspr SPRN_PSSCR,r3 - LOAD_REG_ADDR(r5,power_enter_stop) - li r4,1 + LOAD_REG_ADDR(r4,power_enter_stop) b pnv_powersave_common /* No return */ @@ -436,17 +395,17 @@ pnv_powersave_wakeup_mce: /* * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake - * reason into SRR1, which allows reuse of the system reset wakeup + * reason into r12, which allows reuse of the system reset wakeup * code without being mistaken for another type of wakeup. */ - oris r3,r3,SRR1_WAKEMCE_RESVD@h - mtspr SPRN_SRR1,r3 + oris r12,r3,SRR1_WAKEMCE_RESVD@h b pnv_powersave_wakeup /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss + * r12 - SRR1 */ .global pnv_powersave_wakeup pnv_powersave_wakeup: @@ -464,6 +423,8 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) li r0,PNV_THREAD_RUNNING stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ + mr r3,r12 + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE li r0,KVM_HWTHREAD_IN_KERNEL stb r0,HSTATE_HWTHREAD_STATE(r13) @@ -477,7 +438,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) #endif /* Return SRR1 from power7_nap() */ - mfspr r3,SPRN_SRR1 blt cr3,pnv_wakeup_noloss b pnv_wakeup_loss @@ -489,18 +449,35 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) */ pnv_restore_hyp_resource_arch300: /* + * Workaround for POWER9, if we lost resources, the ERAT + * might have been mixed up and needs flushing. + */ + blt cr3,1f + PPC_INVALIDATE_ERAT +1: + /* * POWER ISA 3. 
Use PSSCR to determine if we * are waking up from deep idle state */ LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) - mfspr r5,SPRN_PSSCR +BEGIN_FTR_SECTION_NESTED(71) + /* + * Assume that we are waking up from the state + * same as the Requested Level (RL) in the PSSCR + * which are Bits 60-63 + */ + ld r5,PACA_REQ_PSSCR(r13) + rldicl r5,r5,0,60 +FTR_SECTION_ELSE_NESTED(71) /* * 0-3 bits correspond to Power-Saving Level Status * which indicates the idle state we are waking up from */ + mfspr r5, SPRN_PSSCR rldicl r5,r5,4,60 +ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 71) cmpd cr4,r5,r4 bge cr4,pnv_wakeup_tb_loss /* returns to caller */ @@ -567,9 +544,9 @@ pnv_wakeup_tb_loss: * is required to return back to reset vector after hypervisor state * restore is complete. */ + mr r19,r12 mr r18,r4 mflr r17 - mfspr r16,SPRN_SRR1 BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) @@ -731,13 +708,14 @@ timebase_resync: * Use cr3 which indicates that we are waking up with atleast partial * hypervisor state loss to determine if TIMEBASE RESYNC is needed. */ - ble cr3,clear_lock + ble cr3,.Ltb_resynced /* Time base re-sync */ bl opal_resync_timebase; /* - * If waking up from sleep, per core state is not lost, skip to - * clear_lock. + * If waking up from sleep (POWER8), per core state + * is not lost, skip to clear_lock. */ +.Ltb_resynced: blt cr4,clear_lock /* @@ -812,9 +790,13 @@ no_segments: mtctr r12 bctrl +BEGIN_FTR_SECTION + ld r4,_LPCR(r1) + mtspr SPRN_LPCR,r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: - mtspr SPRN_SRR1,r16 + mr r12,r19 mtlr r17 blr /* return to pnv_powersave_wakeup */ @@ -827,6 +809,7 @@ fastsleep_workaround_at_exit: /* * R3 here contains the value that will be returned to the caller * of power7_nap. + * R12 contains SRR1 for CHECK_HMI_INTERRUPT. 
*/ .global pnv_wakeup_loss pnv_wakeup_loss: @@ -836,32 +819,33 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) REST_NVGPRS(r1) REST_GPR(2, r1) + ld r4,PACAKMSR(r13) + ld r5,_LINK(r1) ld r6,_CCR(r1) - ld r4,_MSR(r1) - ld r5,_NIP(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr /* * R3 here contains the value that will be returned to the caller * of power7_nap. + * R12 contains SRR1 for CHECK_HMI_INTERRUPT. */ pnv_wakeup_noloss: lbz r0,PACA_NAPSTATELOST(r13) cmpwi r0,0 bne pnv_wakeup_loss + ld r1,PACAR1(r13) BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ld r1,PACAR1(r13) - ld r6,_CCR(r1) - ld r4,_MSR(r1) + ld r4,PACAKMSR(r13) ld r5,_NIP(r1) + ld r6,_CCR(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c291df30fe3..0bcec745a672 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -322,7 +322,8 @@ bool prep_irq_for_idle(void) * First we need to hard disable to ensure no interrupt * occurs before we effectively enter the low power state */ - hard_irq_disable(); + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; /* * If anything happened while we were soft-disabled, @@ -347,6 +348,65 @@ bool prep_irq_for_idle(void) return true; } +#ifdef CONFIG_PPC_BOOK3S +/* + * This is for idle sequences that return with IRQs off, but the + * idle state itself wakes on interrupt. Tell the irq tracer that + * IRQs are enabled for the duration of idle so it does not get long + * off times. Must be paired with fini_irq_for_idle_irqsoff. 
+ */ +bool prep_irq_for_idle_irqsoff(void) +{ + WARN_ON(!irqs_disabled()); + + /* + * First we need to hard disable to ensure no interrupt + * occurs before we effectively enter the low power state + */ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + /* + * If anything happened while we were soft-disabled, + * we return now and do not enter the low power state. + */ + if (lazy_irq_pending()) + return false; + + /* Tell lockdep we are about to re-enable */ + trace_hardirqs_on(); + + return true; +} + +/* + * Take the SRR1 wakeup reason, index into this table to find the + * appropriate irq_happened bit. + */ +static const u8 srr1_to_lazyirq[0x10] = { + 0, 0, 0, + PACA_IRQ_DBELL, + 0, + PACA_IRQ_DBELL, + PACA_IRQ_DEC, + 0, + PACA_IRQ_EE, + PACA_IRQ_EE, + PACA_IRQ_HMI, + 0, 0, 0, 0, 0 }; + +void irq_set_pending_from_srr1(unsigned long srr1) +{ + unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + + /* + * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, + * so this can be called unconditionally with srr1 wake reason. + */ + local_paca->irq_happened |= srr1_to_lazyirq[idx]; +} +#endif /* CONFIG_PPC_BOOK3S */ + /* * Force a replay of the external interrupt handler on this CPU. 
*/ diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 01addfb0ed0a..45f1ff721c32 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -164,17 +164,13 @@ NOKPROBE_SYMBOL(arch_prepare_kprobe); void arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, BREAKPOINT_INSTRUCTION); } NOKPROBE_SYMBOL(arch_arm_kprobe); void arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, p->opcode); } NOKPROBE_SYMBOL(arch_disarm_kprobe); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index a9bfa49f3698..e0e131e662ed 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -268,6 +268,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", + "Instruction fetch (foreign)", "Page table walk ifetch (bad)", "Page table walk ifetch (foreign)", "Load (bad)", diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index f913139bb0c2..d24e689e893f 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -236,6 +236,9 @@ static const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000081c0000, 0x0000000008000000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, diff --git a/arch/powerpc/kernel/misc_32.S 
b/arch/powerpc/kernel/misc_32.S index 84db14e435f5..3f7a9a2d2435 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -244,8 +244,7 @@ _GLOBAL(_nmask_and_or_msr) */ _GLOBAL(real_readb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync @@ -262,8 +261,7 @@ _GLOBAL(real_readb) */ _GLOBAL(real_writeb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index ec60ed0d4aad..6f8273f5e988 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -158,12 +158,13 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) { /* addis r4,0,(insn)@h */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(4) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(4) | + ((val >> 16) & 0xffff)); + addr++; /* ori r4,r4,(insn)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(4) | ___PPC_RS(4) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(4) | + ___PPC_RS(4) | (val & 0xffff)); } /* @@ -173,24 +174,28 @@ void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr) { /* lis r3,(op)@highest */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(3) | - ((val >> 48) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(3) | + ((val >> 48) & 0xffff)); + addr++; /* ori r3,r3,(op)@higher */ - *addr++ = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 32) & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 32) & 0xffff)); + addr++; /* rldicr r3,r3,32,31 */ - *addr++ = PPC_INST_RLDICR | ___PPC_RA(3) | ___PPC_RS(3) | - __PPC_SH64(32) | __PPC_ME64(31); + patch_instruction(addr, PPC_INST_RLDICR | ___PPC_RA(3) | + ___PPC_RS(3) | __PPC_SH64(32) | __PPC_ME64(31)); 
+ addr++; /* oris r3,r3,(op)@h */ - *addr++ = PPC_INST_ORIS | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ORIS | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 16) & 0xffff)); + addr++; /* ori r3,r3,(op)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | (val & 0xffff)); } int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) @@ -198,7 +203,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step; kprobe_opcode_t *op_callback_addr, *emulate_step_addr; long b_offset; - unsigned long nip; + unsigned long nip, size; + int rc, i; kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; @@ -231,8 +237,14 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) goto error; /* Setup template */ - memcpy(buff, optprobe_template_entry, - TMPL_END_IDX * sizeof(kprobe_opcode_t)); + /* We can optimize this via patch_instruction_window later */ + size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int); + pr_devel("Copying template to %p, size %lu\n", buff, size); + for (i = 0; i < size; i++) { + rc = patch_instruction(buff + i, *(optprobe_template_entry + i)); + if (rc < 0) + goto error; + } /* * Fixup the template with instructions to: @@ -261,8 +273,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) if (!branch_op_callback || !branch_emulate_step) goto error; - buff[TMPL_CALL_HDLR_IDX] = branch_op_callback; - buff[TMPL_EMULATE_IDX] = branch_emulate_step; + patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback); + patch_instruction(buff + TMPL_EMULATE_IDX, branch_emulate_step); /* * 3. 
load instruction to be emulated into relevant register, and @@ -272,8 +284,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 4. branch back from trampoline */ - buff[TMPL_RET_IDX] = create_branch((unsigned int *)buff + TMPL_RET_IDX, - (unsigned long)nip, 0); + patch_branch(buff + TMPL_RET_IDX, (unsigned long)nip, 0); flush_icache_range((unsigned long)buff, (unsigned long)(&buff[TMPL_END_IDX])); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 2ad725ef4368..9f3e2c932dcc 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -511,6 +511,10 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; + /* + * Syscall exit makes a similar initial check before branching + * to restore_math. Keep them in synch. + */ if (!msr_tm_active(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; @@ -1133,6 +1137,11 @@ static inline void restore_sprs(struct thread_struct *old_thread, #endif } +#ifdef CONFIG_PPC_BOOK3S_64 +#define CP_SIZE 128 +static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE))); +#endif + struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { @@ -1195,12 +1204,14 @@ struct task_struct *__switch_to(struct task_struct *prev, __switch_to_tm(prev, new); - /* - * We can't take a PMU exception inside _switch() since there is a - * window where the kernel stack SLB and the kernel stack are out - * of sync. Hard disable here. - */ - hard_irq_disable(); + if (!radix_enabled()) { + /* + * We can't take a PMU exception inside _switch() since there + * is a window where the kernel stack SLB and the kernel stack + * are out of sync. Hard disable here. + */ + hard_irq_disable(); + } /* * Call restore_sprs() before calling _switch(). 
If we move it after @@ -1220,8 +1231,28 @@ struct task_struct *__switch_to(struct task_struct *prev, batch->active = 1; } - if (current_thread_info()->task->thread.regs) + if (current_thread_info()->task->thread.regs) { restore_math(current_thread_info()->task->thread.regs); + + /* + * The copy-paste buffer can only store into foreign real + * addresses, so unprivileged processes can not see the + * data or use it in any way unless they have foreign real + * mappings. We don't have a VAS driver that allocates those + * yet, so no cpabort is required. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + /* + * DD1 allows paste into normal system memory, so we + * do an unpaired copy here to clear the buffer and + * prevent a covert channel being set up. + * + * cpabort is not used because it is quite expensive. + */ + asm volatile(PPC_COPY(%0, %1) + : : "r"(dummy_copy_buffer), "r"(0)); + } + } #endif /* CONFIG_PPC_STD_MMU_64 */ return last; diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 857129acf960..94a948207cd2 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -335,6 +335,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) maj = ((pvr >> 8) & 0xFF) - 1; min = pvr & 0xFF; break; + case 0x004e: /* POWER9 bits 12-15 give chip type */ + maj = (pvr >> 8) & 0x0F; + min = pvr & 0xFF; + break; default: maj = (pvr >> 8) & 0xFF; min = pvr & 0xFF; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 1069f74fca47..c6b8bace1766 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -33,6 +33,7 @@ #include <linux/notifier.h> #include <linux/topology.h> #include <linux/profile.h> +#include <linux/processor.h> #include <asm/ptrace.h> #include <linux/atomic.h> @@ -112,7 +113,8 @@ int smp_generic_cpu_bootable(unsigned int nr) #ifdef CONFIG_PPC64 int smp_generic_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= nr_cpu_ids) + 
return -EINVAL; /* * The processor is currently spinning, waiting for the @@ -766,8 +768,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu)) - cpu_relax(); + spin_until_cond(cpu_online(cpu)); return 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2b33cfaac7b8..fe6f3a285455 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -59,10 +59,10 @@ #include <linux/suspend.h> #include <linux/rtc.h> #include <linux/sched/cputime.h> +#include <linux/processor.h> #include <asm/trace.h> #include <asm/io.h> -#include <asm/processor.h> #include <asm/nvram.h> #include <asm/cache.h> #include <asm/machdep.h> @@ -442,6 +442,7 @@ void __delay(unsigned long loops) unsigned long start; int diff; + spin_begin(); if (__USE_RTC()) { start = get_rtcl(); do { @@ -449,13 +450,14 @@ void __delay(unsigned long loops) diff = get_rtcl() - start; if (diff < 0) diff += 1000000000; + spin_cpu_relax(); } while (diff < loops); } else { start = get_tbl(); while (get_tbl() - start < loops) - HMT_low(); - HMT_medium(); + spin_cpu_relax(); } + spin_end(); } EXPORT_SYMBOL(__delay); @@ -675,7 +677,7 @@ EXPORT_SYMBOL_GPL(tb_to_ns); * the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b * are 64-bit unsigned numbers. 
*/ -unsigned long long sched_clock(void) +notrace unsigned long long sched_clock(void) { if (__USE_RTC()) return get_rtc(); @@ -739,12 +741,20 @@ static int __init get_freq(char *name, int cells, unsigned long *val) static void start_cpu_decrementer(void) { #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) + unsigned int tcr; + /* Clear any pending timer interrupts */ mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); - /* Enable decrementer interrupt */ - mtspr(SPRN_TCR, TCR_DIE); -#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */ + tcr = mfspr(SPRN_TCR); + /* + * The watchdog may have already been enabled by u-boot. So leave + * TRC[WP] (Watchdog Period) alone. + */ + tcr &= TCR_WP_MASK; /* Clear all bits except for TCR[WP] */ + tcr |= TCR_DIE; /* Enable decrementer */ + mtspr(SPRN_TCR, tcr); +#endif } void __init generic_calibrate_decr(void) @@ -823,38 +833,76 @@ void read_persistent_clock(struct timespec *ts) } /* clocksource code */ -static u64 rtc_read(struct clocksource *cs) +static notrace u64 rtc_read(struct clocksource *cs) { return (u64)get_rtc(); } -static u64 timebase_read(struct clocksource *cs) +static notrace u64 timebase_read(struct clocksource *cs) { return (u64)get_tb(); } -void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, - struct clocksource *clock, u32 mult, u64 cycle_last) + +void update_vsyscall(struct timekeeper *tk) { + struct timespec xt; + struct clocksource *clock = tk->tkr_mono.clock; + u32 mult = tk->tkr_mono.mult; + u32 shift = tk->tkr_mono.shift; + u64 cycle_last = tk->tkr_mono.cycle_last; u64 new_tb_to_xs, new_stamp_xsec; - u32 frac_sec; + u64 frac_sec; if (clock != &clocksource_timebase) return; + xt.tv_sec = tk->xtime_sec; + xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + /* Make userspace gettimeofday spin until we're done. 
*/ ++vdso_data->tb_update_count; smp_mb(); - /* 19342813113834067 ~= 2^(20+64) / 1e9 */ - new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); - new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; + /* + * This computes ((2^20 / 1e9) * mult) >> shift as a + * 0.64 fixed-point fraction. + * The computation in the else clause below won't overflow + * (as long as the timebase frequency is >= 1.049 MHz) + * but loses precision because we lose the low bits of the constant + * in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9. + * For a shift of 24 the error is about 0.5e-9, or about 0.5ns + * over a second. (Shift values are usually 22, 23 or 24.) + * For high frequency clocks such as the 512MHz timebase clock + * on POWER[6789], the mult value is small (e.g. 32768000) + * and so we can shift the constant by 16 initially + * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the + * remaining shifts after the multiplication, which gives a + * more accurate result (e.g. with mult = 32768000, shift = 24, + * the error is only about 1.2e-12, or 0.7ns over 10 minutes). + */ + if (mult <= 62500000 && clock->shift >= 16) + new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16); + else + new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); + + /* + * Compute the fractional second in units of 2^-32 seconds. + * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift + * in nanoseconds, so multiplying that by 2^32 / 1e9 gives + * it in units of 2^-32 seconds. + * We assume shift <= 32 because clocks_calc_mult_shift() + * generates shift values in the range 0 - 32. 
+ */ + frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift); + do_div(frac_sec, NSEC_PER_SEC); - BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC); - /* this is tv_nsec / 1e9 as a 0.32 fraction */ - frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32; + /* + * Work out new stamp_xsec value for any legacy users of systemcfg. + * stamp_xsec is in units of 2^-20 seconds. + */ + new_stamp_xsec = frac_sec >> 12; + new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC; /* * tb_update_count is used to allow the userspace gettimeofday code @@ -864,15 +912,13 @@ void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, * the two values of tb_update_count match and are even then the * tb_to_xs and stamp_xsec values are consistent. If not, then it * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. */ vdso_data->tb_orig_stamp = cycle_last; vdso_data->stamp_xsec = new_stamp_xsec; vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wtm->tv_sec; - vdso_data->wtom_clock_nsec = wtm->tv_nsec; - vdso_data->stamp_xtime = *wall_time; + vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec; + vdso_data->stamp_xtime = xt; vdso_data->stamp_sec_fraction = frac_sec; smp_wmb(); ++(vdso_data->tb_update_count); diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 3a2d04134da9..c4ba37822ba0 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -313,8 +313,8 @@ dont_backup_fp: blr - /* void tm_recheckpoint(struct thread_struct *thread, - * unsigned long orig_msr) + /* void __tm_recheckpoint(struct thread_struct *thread, + * unsigned long orig_msr) * - Restore the checkpointed register state saved by tm_reclaim * when we switch_to a process. 
* diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index d4e545d27ef9..bfcfd9ef09f2 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -237,6 +237,7 @@ void die(const char *str, struct pt_regs *regs, long err) err = 0; oops_end(flags, regs, err); } +NOKPROBE_SYMBOL(die); void user_single_step_siginfo(struct task_struct *tsk, struct pt_regs *regs, siginfo_t *info) @@ -1968,6 +1969,7 @@ void unrecoverable_exception(struct pt_regs *regs) regs->trap, regs->nip); die("Unrecoverable exception", regs, SIGABRT); } +NOKPROBE_SYMBOL(unrecoverable_exception); #if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x) /* @@ -1998,6 +2000,7 @@ void kernel_bad_stack(struct pt_regs *regs) regs->gpr[1], regs->nip); die("Bad kernel stack pointer", regs, SIGABRT); } +NOKPROBE_SYMBOL(kernel_bad_stack); void __init trap_init(void) { diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 2f793be3d2b1..b1a250560198 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -8,6 +8,12 @@ #include <asm/cache.h> #include <asm/thread_info.h> +#ifdef CONFIG_STRICT_KERNEL_RWX +#define STRICT_ALIGN_SIZE (1 << 24) +#else +#define STRICT_ALIGN_SIZE PAGE_SIZE +#endif + ENTRY(_stext) PHDRS { @@ -58,7 +64,6 @@ SECTIONS #ifdef CONFIG_PPC64 KEEP(*(.head.text.first_256B)); #ifdef CONFIG_PPC_BOOK3E -# define END_FIXED 0x100 #else KEEP(*(.head.text.real_vectors)); *(.head.text.real_trampolines); @@ -66,12 +71,8 @@ SECTIONS *(.head.text.virt_trampolines); # if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) KEEP(*(.head.data.fwnmi_page)); -# define END_FIXED 0x8000 -# else -# define END_FIXED 0x7000 # endif #endif - ASSERT((. 
== END_FIXED), "vmlinux.lds.S: fixed section overflow error"); #else /* !CONFIG_PPC64 */ HEAD_TEXT #endif @@ -79,23 +80,6 @@ SECTIONS __head_end = .; - /* - * If the build dies here, it's likely code in head_64.S is referencing - * labels it can't reach, and the linker inserting stubs without the - * assembler's knowledge. To debug, remove the above assert and - * rebuild. Look for branch stubs in the fixed section region. - * - * Linker stub generation could be allowed in "trampoline" - * sections if absolutely necessary, but this would require - * some rework of the fixed sections. Before resorting to this, - * consider references that have sufficient addressing range, - * (e.g., hand coded trampolines) so the linker does not have - * to add stubs. - * - * Linker stubs at the top of the main text section are currently not - * detected, and will result in a crash at boot due to offsets being - * wrong. - */ #ifdef CONFIG_PPC64 /* * BLOCK(0) overrides the default output section alignment because @@ -103,18 +87,31 @@ SECTIONS * section placement to work. */ .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_LD_HEAD_STUB_CATCH + *(.linker_stub_catch); + . = . ; +#endif + #else .text : AT(ADDR(.text) - LOAD_OFFSET) { ALIGN_FUNCTION(); #endif /* careful! __ftr_alt_* sections need to be close to .text */ - *(.text .fixup __ftr_alt_* .ref.text) + *(.text.hot .text .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT + /* + * -Os builds call FP save/restore functions. The powerpc64 + * linker generates those on demand in the .sfpr section. + * .sfpr gets placed at the beginning of a group of input + * sections, which can break start-of-text offset if it is + * included with the main text sections, so put it by itself. 
+ */ + *(.sfpr); MEM_KEEP(init.text) MEM_KEEP(exit.text) @@ -132,7 +129,7 @@ SECTIONS PROVIDE32 (etext = .); /* Read-only data */ - RODATA + RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(0) @@ -149,7 +146,7 @@ SECTIONS /* * Init sections discarded at runtime */ - . = ALIGN(PAGE_SIZE); + . = ALIGN(STRICT_ALIGN_SIZE); __init_begin = .; INIT_TEXT_SECTION(PAGE_SIZE) :kernel @@ -267,7 +264,9 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA *(.sdata) + *(.sdata2) *(.got.plt) *(.got) + *(.plt) } #else .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -330,6 +329,16 @@ SECTIONS _end = . ; PROVIDE32 (end = .); - /* Sections to be discarded. */ + STABS_DEBUG + + DWARF_DEBUG + DISCARDS + /DISCARD/ : { + *(*.EMB.apuinfo) + *(.glink .iplt .plt .rela* .comment) + *(.gnu.version*) + *(.gnu.attributes) + *(.eh_frame) + } } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index ce6f2121fffe..584c74c8119f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -15,6 +15,7 @@ #include <linux/log2.h> #include <asm/tlbflush.h> +#include <asm/trace.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> #include <asm/book3s/64/mmu-hash.h> @@ -443,17 +444,23 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, cpu_relax(); if (need_sync) asm volatile("ptesync" : : : "memory"); - for (i = 0; i < npages; ++i) + for (i = 0; i < npages; ++i) { asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : "r" (rbvalues[i]), "r" (kvm->arch.lpid)); + trace_tlbie(kvm->arch.lpid, 0, rbvalues[i], + kvm->arch.lpid, 0, 0, 0); + } asm volatile("eieio; tlbsync; ptesync" : : : "memory"); kvm->arch.tlbie_lock = 0; } else { if (need_sync) asm volatile("ptesync" : : : "memory"); - for (i = 0; i < npages; ++i) + for (i = 0; i < npages; ++i) { asm volatile(PPC_TLBIEL(%0,%1,0,0,0) : : "r" (rbvalues[i]), "r" (0)); + trace_tlbie(kvm->arch.lpid, 1, rbvalues[i], + 0, 0, 0, 0); + } asm volatile("ptesync" : : : "memory"); } } diff --git 
a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 6ea4b53f4b16..cb44065e2946 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -313,15 +313,21 @@ kvm_novcpu_exit: * We come in here when wakened from nap mode. * Relocation is off and most register values are lost. * r13 points to the PACA. + * r3 contains the SRR1 wakeup value, SRR1 is trashed. */ .globl kvm_start_guest kvm_start_guest: - /* Set runlatch bit the minute you wake up from nap */ mfspr r0, SPRN_CTRLF ori r0, r0, 1 mtspr SPRN_CTRLT, r0 + /* + * Could avoid this and pass it through in r3. For now, + * code expects it to be in SRR1. + */ + mtspr SPRN_SRR1,r3 + ld r2,PACATOC(r13) li r0,KVM_HWTHREAD_IN_KVM @@ -440,13 +446,15 @@ kvm_no_guest: /* * We jump to pnv_wakeup_loss, which will return to the caller * of power7_nap in the powernv cpu offline loop. The value we - * put in r3 becomes the return value for power7_nap. + * put in r3 becomes the return value for power7_nap. pnv_wakeup_loss + * requires SRR1 in r12. 
*/ li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 li r3, 0 + mfspr r12,SPRN_SRR1 b pnv_wakeup_loss 53: HMT_LOW diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index ed7dfce331e0..3c3146ba62da 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -9,10 +9,17 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) -obj-y += string.o alloc.o crtsavres.o code-patching.o \ - feature-fixups.o +obj-y += string.o alloc.o code-patching.o feature-fixups.o -obj-$(CONFIG_PPC32) += div64.o copy_32.o +obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o + +# See corresponding test in arch/powerpc/Makefile +# 64-bit linker creates .sfpr on demand for final link (vmlinux), +# so it is only needed for modules, and only for older linkers which +# do not support --save-restore-funcs +ifeq ($(call ld-ifversion, -lt, 225000000, y),y) +extra-$(CONFIG_PPC64) += crtsavres.o +endif obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \ @@ -30,7 +37,7 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o -obj-$(CONFIG_ALTIVEC) += xor_vmx.o +obj-$(CONFIG_ALTIVEC) += xor_vmx.o xor_vmx_glue.o CFLAGS_xor_vmx.o += -maltivec $(call cc-option,-mabi=altivec) obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 500b0f6a0b64..c9de03e0c1f1 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -12,23 +12,186 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/mm.h> -#include <asm/page.h> -#include <asm/code-patching.h> +#include <linux/cpuhotplug.h> +#include <linux/slab.h> #include <linux/uaccess.h> #include <linux/kprobes.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> 
+#include <asm/page.h> +#include <asm/code-patching.h> -int patch_instruction(unsigned int *addr, unsigned int instr) +static int __patch_instruction(unsigned int *addr, unsigned int instr) { int err; __put_user_size(instr, addr, 4, err); if (err) return err; - asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr)); + + asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" :: "r" (addr)); + + return 0; +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); + +static int text_area_cpu_up(unsigned int cpu) +{ + struct vm_struct *area; + + area = get_vm_area(PAGE_SIZE, VM_ALLOC); + if (!area) { + WARN_ONCE(1, "Failed to create text area for cpu %d\n", + cpu); + return -1; + } + this_cpu_write(text_poke_area, area); + + return 0; +} + +static int text_area_cpu_down(unsigned int cpu) +{ + free_vm_area(this_cpu_read(text_poke_area)); + return 0; +} + +/* + * Run as a late init call. This allows all the boot time patching to be done + * simply by patching the code, and then we're called here prior to + * mark_rodata_ro(), which happens after all init calls are run. Although + * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge + * it as being preferable to a kernel that will crash later when someone tries + * to use patch_instruction(). + */ +static int __init setup_text_poke_area(void) +{ + BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "powerpc/text_poke:online", text_area_cpu_up, + text_area_cpu_down)); + + return 0; +} +late_initcall(setup_text_poke_area); + +/* + * This can be called for kernel text or a module. 
+ */ +static int map_patch_area(void *addr, unsigned long text_poke_addr) +{ + unsigned long pfn; + int err; + + if (is_vmalloc_addr(addr)) + pfn = vmalloc_to_pfn(addr); + else + pfn = __pa_symbol(addr) >> PAGE_SHIFT; + + err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), + pgprot_val(PAGE_KERNEL)); + + pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err); + if (err) + return -1; + return 0; } +static inline int unmap_patch_area(unsigned long addr) +{ + pte_t *ptep; + pmd_t *pmdp; + pud_t *pudp; + pgd_t *pgdp; + + pgdp = pgd_offset_k(addr); + if (unlikely(!pgdp)) + return -EINVAL; + + pudp = pud_offset(pgdp, addr); + if (unlikely(!pudp)) + return -EINVAL; + + pmdp = pmd_offset(pudp, addr); + if (unlikely(!pmdp)) + return -EINVAL; + + ptep = pte_offset_kernel(pmdp, addr); + if (unlikely(!ptep)) + return -EINVAL; + + pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr); + + /* + * In hash, pte_clear flushes the tlb, in radix, we have to + */ + pte_clear(&init_mm, addr, ptep); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + + return 0; +} + +int patch_instruction(unsigned int *addr, unsigned int instr) +{ + int err; + unsigned int *dest = NULL; + unsigned long flags; + unsigned long text_poke_addr; + unsigned long kaddr = (unsigned long)addr; + + /* + * During early early boot patch_instruction is called + * when text_poke_area is not ready, but we still need + * to allow patching. We just do the plain old patching + * We use slab_is_available and per cpu read * via this_cpu_read + * of text_poke_area. 
Per-CPU areas might not be up early + * this can create problems with just using this_cpu_read() + */ + if (!slab_is_available() || !this_cpu_read(text_poke_area)) + return __patch_instruction(addr, instr); + + local_irq_save(flags); + + text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr; + if (map_patch_area(addr, text_poke_addr)) { + err = -1; + goto out; + } + + dest = (unsigned int *)(text_poke_addr) + + ((kaddr & ~PAGE_MASK) / sizeof(unsigned int)); + + /* + * We use __put_user_size so that we can handle faults while + * writing to dest and return err to handle faults gracefully + */ + __put_user_size(instr, dest, 4, err); + if (!err) + asm ("dcbst 0, %0; sync; icbi 0,%0; icbi 0,%1; sync; isync" + ::"r" (dest), "r"(addr)); + + err = unmap_patch_area(text_poke_addr); + if (err) + pr_warn("failed to unmap %lx\n", text_poke_addr); + +out: + local_irq_restore(flags); + + return err; +} +#else /* !CONFIG_STRICT_KERNEL_RWX */ + +int patch_instruction(unsigned int *addr, unsigned int instr) +{ + return __patch_instruction(addr, instr); +} + +#endif /* CONFIG_STRICT_KERNEL_RWX */ +NOKPROBE_SYMBOL(patch_instruction); + int patch_branch(unsigned int *addr, unsigned long target, int flags) { return patch_instruction(addr, create_branch(addr, target, flags)); diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S index a24b4039352c..706b7cc19846 100644 --- a/arch/powerpc/lib/copyuser_power7.S +++ b/arch/powerpc/lib/copyuser_power7.S @@ -82,14 +82,14 @@ _GLOBAL(__copy_tofrom_user_power7) #ifdef CONFIG_ALTIVEC cmpldi r5,16 - cmpldi cr1,r5,4096 + cmpldi cr1,r5,3328 std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) blt .Lshort_copy - bgt cr1,.Lvmx_copy + bge cr1,.Lvmx_copy #else cmpldi r5,16 diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S index 18af0b3d3eb2..7e5e1c28e56a 100644 --- a/arch/powerpc/lib/crtsavres.S +++ 
b/arch/powerpc/lib/crtsavres.S @@ -44,10 +44,10 @@ #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE -#ifndef CONFIG_PPC64 - .section ".text" +#ifndef CONFIG_PPC64 + /* Routines for saving integer registers, called by the compiler. */ /* Called with r11 pointing to the stack header word of the caller of the */ /* function, just beyond the end of the integer save area. */ @@ -314,8 +314,6 @@ _GLOBAL(_restvr_31) #else /* CONFIG_PPC64 */ - .section ".text.save.restore","ax",@progbits - .globl _savegpr0_14 _savegpr0_14: std r14,-144(r1) diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c index f9de69a04e88..4df240aa5f81 100644 --- a/arch/powerpc/lib/xor_vmx.c +++ b/arch/powerpc/lib/xor_vmx.c @@ -29,10 +29,7 @@ #define vector __attribute__((vector_size(16))) #endif -#include <linux/preempt.h> -#include <linux/export.h> -#include <linux/sched.h> -#include <asm/switch_to.h> +#include "xor_vmx.h" typedef vector signed char unative_t; @@ -64,16 +61,13 @@ typedef vector signed char unative_t; V1##_3 = vec_xor(V1##_3, V2##_3); \ } while (0) -void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in) +void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in) { DEFINE(v1); DEFINE(v2); unsigned long lines = bytes / (sizeof(unative_t)) / 4; - preempt_disable(); - enable_kernel_altivec(); - do { LOAD(v1); LOAD(v2); @@ -83,23 +77,16 @@ void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, v1 += 4; v2 += 4; } while (--lines > 0); - - disable_kernel_altivec(); - preempt_enable(); } -EXPORT_SYMBOL(xor_altivec_2); -void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in) +void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in) { DEFINE(v1); DEFINE(v2); DEFINE(v3); unsigned long lines = bytes / (sizeof(unative_t)) / 4; - preempt_disable(); - enable_kernel_altivec(); - do { LOAD(v1); LOAD(v2); @@ -112,15 +99,11 @@ void 
xor_altivec_3(unsigned long bytes, unsigned long *v1_in, v2 += 4; v3 += 4; } while (--lines > 0); - - disable_kernel_altivec(); - preempt_enable(); } -EXPORT_SYMBOL(xor_altivec_3); -void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in) +void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in) { DEFINE(v1); DEFINE(v2); @@ -128,9 +111,6 @@ void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, DEFINE(v4); unsigned long lines = bytes / (sizeof(unative_t)) / 4; - preempt_disable(); - enable_kernel_altivec(); - do { LOAD(v1); LOAD(v2); @@ -146,15 +126,11 @@ void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, v3 += 4; v4 += 4; } while (--lines > 0); - - disable_kernel_altivec(); - preempt_enable(); } -EXPORT_SYMBOL(xor_altivec_4); -void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, - unsigned long *v2_in, unsigned long *v3_in, - unsigned long *v4_in, unsigned long *v5_in) +void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in, unsigned long *v5_in) { DEFINE(v1); DEFINE(v2); @@ -163,9 +139,6 @@ void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, DEFINE(v5); unsigned long lines = bytes / (sizeof(unative_t)) / 4; - preempt_disable(); - enable_kernel_altivec(); - do { LOAD(v1); LOAD(v2); @@ -184,8 +157,4 @@ void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, v4 += 4; v5 += 4; } while (--lines > 0); - - disable_kernel_altivec(); - preempt_enable(); } -EXPORT_SYMBOL(xor_altivec_5); diff --git a/arch/powerpc/lib/xor_vmx.h b/arch/powerpc/lib/xor_vmx.h new file mode 100644 index 000000000000..4746708451ae --- /dev/null +++ b/arch/powerpc/lib/xor_vmx.h @@ -0,0 +1,20 @@ +/* + * Simple interface to link xor_vmx.c and xor_vmx_glue.c + * + * Separating these file ensures that no altivec instructions are run 
+ * outside of the enable/disable altivec block. + */ + +void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in); + +void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in); + +void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in); + +void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in, unsigned long *v5_in); diff --git a/arch/powerpc/lib/xor_vmx_glue.c b/arch/powerpc/lib/xor_vmx_glue.c new file mode 100644 index 000000000000..6521fe5e8cef --- /dev/null +++ b/arch/powerpc/lib/xor_vmx_glue.c @@ -0,0 +1,62 @@ +/* + * Altivec XOR operations + * + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/preempt.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <asm/switch_to.h> +#include "xor_vmx.h" + +void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_2(bytes, v1_in, v2_in); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_2); + +void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_3(bytes, v1_in, v2_in, v3_in); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_3); + +void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_4(bytes, v1_in, v2_in, v3_in, v4_in); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_4); + +void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in, unsigned long *v5_in) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_5(bytes, v1_in, v2_in, v3_in, v4_in, v5_in); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_5); diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 6c5025e81236..f4c6472f2fc4 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -88,7 +88,7 @@ static void mmu_mapin_immr(void) int offset; for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) - map_page(v + offset, p + offset, f); + map_kernel_page(v + offset, p + offset, f); } /* Address of instructions to patch */ diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 2dc74e5c6458..382528475433 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -227,7 
+227,7 @@ __dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t do { SetPageReserved(page); - map_page(vaddr, page_to_phys(page), + map_kernel_page(vaddr, page_to_phys(page), pgprot_val(pgprot_noncached(PAGE_KERNEL))); page++; vaddr += PAGE_SIZE; diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index c6b900f54c07..b1c144b03fcf 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -335,7 +335,7 @@ static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize) unsigned long rpn, lp_bits; int base_psize = 0, actual_psize = 0; - if (ea <= PAGE_OFFSET) + if (ea < PAGE_OFFSET) return -1; /* Look in primary table */ diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 3a7d580fdc59..4c422632047b 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -206,6 +206,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, int is_write = 0; int trap = TRAP(regs); int is_exec = trap == 0x400; + int is_user = user_mode(regs); int fault; int rc = 0, store_update_sp = 0; @@ -216,7 +217,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * bits we are interested in. But there are some bits which * indicate errors in DSISR but can validly be set in SRR1. */ - if (trap == 0x400) + if (is_exec) error_code &= 0x48200000; else is_write = error_code & DSISR_ISSTORE; @@ -247,13 +248,13 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * The kernel should never take an execute fault nor should it * take a page fault to a kernel address. 
*/ - if (!user_mode(regs) && (is_exec || (address >= TASK_SIZE))) { + if (!is_user && (is_exec || (address >= TASK_SIZE))) { rc = SIGSEGV; goto bail; } #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \ - defined(CONFIG_PPC_BOOK3S_64)) + defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_8xx)) if (error_code & DSISR_DABRMATCH) { /* breakpoint match */ do_break(regs, address, error_code); @@ -266,7 +267,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, local_irq_enable(); if (faulthandler_disabled() || mm == NULL) { - if (!user_mode(regs)) { + if (!is_user) { rc = SIGSEGV; goto bail; } @@ -287,10 +288,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * can result in fault, which will cause a deadlock when called with * mmap_sem held */ - if (!is_exec && user_mode(regs)) + if (is_write && is_user) store_update_sp = store_updates_sp(regs); - if (user_mode(regs)) + if (is_user) flags |= FAULT_FLAG_USER; /* When running in the kernel we expect faults to occur only to @@ -309,7 +310,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * thus avoiding the deadlock. 
*/ if (!down_read_trylock(&mm->mmap_sem)) { - if (!user_mode(regs) && !search_exception_tables(regs->nip)) + if (!is_user && !search_exception_tables(regs->nip)) goto bad_area_nosemaphore; retry: @@ -509,7 +510,7 @@ bad_area: bad_area_nosemaphore: /* User mode accesses cause a SIGSEGV */ - if (user_mode(regs)) { + if (is_user) { _exception(SIGSEGV, regs, code, address); goto bail; } diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 65bb8f33b399..3848af167df9 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -15,6 +15,7 @@ #include <linux/spinlock.h> #include <linux/bitops.h> #include <linux/of.h> +#include <linux/processor.h> #include <linux/threads.h> #include <linux/smp.h> @@ -23,6 +24,7 @@ #include <asm/mmu_context.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> +#include <asm/trace.h> #include <asm/tlb.h> #include <asm/cputable.h> #include <asm/udbg.h> @@ -98,6 +100,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) : "memory"); break; } + trace_tlbie(0, 0, va, 0, 0, 0, 0); } static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) @@ -147,6 +150,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) : "memory"); break; } + trace_tlbie(0, 1, va, 0, 0, 0, 0); } @@ -181,8 +185,10 @@ static inline void native_lock_hpte(struct hash_pte *hptep) while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; + spin_begin(); while(test_bit(HPTE_LOCK_BIT, word)) - cpu_relax(); + spin_cpu_relax(); + spin_end(); } } @@ -407,6 +413,38 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, tlbie(vpn, psize, psize, ssize, 0); } +/* + * Remove a bolted kernel entry. Memory hotplug uses this. + * + * No need to lock here because we should be the only user. 
+ */ +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + return -ENOENT; + + hptep = htab_address + slot; + + VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); + + /* Invalidate the hpte */ + hptep->v = 0; + + /* Invalidate the TLB */ + tlbie(vpn, psize, psize, ssize, 0); + return 0; +} + + static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, int bpsize, int apsize, int ssize, int local) { @@ -725,6 +763,7 @@ void __init hpte_init_native(void) mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; + mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; mmu_hash_ops.hpte_insert = native_hpte_insert; mmu_hash_ops.hpte_remove = native_hpte_remove; mmu_hash_ops.hpte_clear_all = native_hpte_clear; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f2095ce9d4b0..7a20669c19e7 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -810,6 +810,8 @@ static void update_hid_for_hash(void) asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(0), "i"(0), "i"(2), "r"(0) : "memory"); asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); + trace_tlbie(0, 0, rb, 0, 2, 0, 0); + /* * now switch the HID */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index c41dc44472c5..e1bf5ca397fe 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -34,6 +34,7 @@ #define PAGE_SHIFT_16G 34 unsigned int HPAGE_SHIFT; +EXPORT_SYMBOL(HPAGE_SHIFT); /* * Tracks gpages after the device tree is scanned and before the @@ -79,7 +80,7 @@ static int __hugepte_alloc(struct 
mm_struct *mm, hugepd_t *hpdp, num_hugepd = 1; } - new = kmem_cache_zalloc(cachep, GFP_KERNEL); + new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); BUG_ON(pshift > HUGEPD_SHIFT_MASK); BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); @@ -945,7 +946,7 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, if (pmd_none(pmd)) return NULL; - if (pmd_trans_huge(pmd)) { + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { if (is_thp) *is_thp = true; ret_pte = (pte_t *) pmdp; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index ec84b31c6c86..5b4c25d12ff3 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -44,6 +44,7 @@ #include <linux/slab.h> #include <linux/of_fdt.h> #include <linux/libfdt.h> +#include <linux/memremap.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -110,8 +111,29 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } +/* + * vmemmap virtual address space management does not have a traditonal page + * table to track which virtual struct pages are backed by physical mapping. + * The virtual to physical mappings are tracked in a simple linked list + * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at + * all times where as the 'next' list maintains the available + * vmemmap_backing structures which have been deleted from the + * 'vmemmap_global' list during system runtime (memory hotplug remove + * operation). The freed 'vmemmap_backing' structures are reused later when + * new requests come in without allocating fresh memory. This pointer also + * tracks the allocated 'vmemmap_backing' structures as we allocate one + * full page memory at a time when we dont have any. + */ struct vmemmap_backing *vmemmap_list; static struct vmemmap_backing *next; + +/* + * The same pointer 'next' tracks individual chunks inside the allocated + * full page during the boot time and again tracks the freeed nodes during + * runtime. 
It is racy but it does not happen as they are separated by the + * boot process. Will create problem if some how we have memory hotplug + * operation during boot !! + */ static int num_left; static int num_freed; @@ -171,13 +193,17 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); for (; start < end; start += page_size) { + struct vmem_altmap *altmap; void *p; int rc; if (vmemmap_populated(start, page_size)) continue; - p = vmemmap_alloc_block(page_size, node); + /* altmap lookups only work at section boundaries */ + altmap = to_vmem_altmap(SECTION_ALIGN_DOWN(start)); + + p = __vmemmap_alloc_block_buf(page_size, node, altmap); if (!p) return -ENOMEM; @@ -234,13 +260,17 @@ static unsigned long vmemmap_list_free(unsigned long start) void __ref vmemmap_free(unsigned long start, unsigned long end) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long page_order = get_order(page_size); start = _ALIGN_DOWN(start, page_size); pr_debug("vmemmap_free %lx...%lx\n", start, end); for (; start < end; start += page_size) { - unsigned long addr; + unsigned long nr_pages, addr; + struct vmem_altmap *altmap; + struct page *section_base; + struct page *page; /* * the section has already be marked as invalid, so @@ -251,29 +281,33 @@ void __ref vmemmap_free(unsigned long start, unsigned long end) continue; addr = vmemmap_list_free(start); - if (addr) { - struct page *page = pfn_to_page(addr >> PAGE_SHIFT); - - if (PageReserved(page)) { - /* allocated from bootmem */ - if (page_size < PAGE_SIZE) { - /* - * this shouldn't happen, but if it is - * the case, leave the memory there - */ - WARN_ON_ONCE(1); - } else { - unsigned int nr_pages = - 1 << get_order(page_size); - while (nr_pages--) - free_reserved_page(page++); - } - } else - free_pages((unsigned long)(__va(addr)), - get_order(page_size)); - - vmemmap_remove_mapping(start, page_size); + if 
(!addr) + continue; + + page = pfn_to_page(addr >> PAGE_SHIFT); + section_base = pfn_to_page(vmemmap_section_start(start)); + nr_pages = 1 << page_order; + + altmap = to_vmem_altmap((unsigned long) section_base); + if (altmap) { + vmem_altmap_free(altmap, nr_pages); + } else if (PageReserved(page)) { + /* allocated from bootmem */ + if (page_size < PAGE_SIZE) { + /* + * this shouldn't happen, but if it is + * the case, leave the memory there + */ + WARN_ON_ONCE(1); + } else { + while (nr_pages--) + free_reserved_page(page++); + } + } else { + free_pages((unsigned long)(__va(addr)), page_order); } + + vmemmap_remove_mapping(start, page_size); } } #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index de5a90e1ceaa..8541f18694a4 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -36,6 +36,7 @@ #include <linux/hugetlb.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/memremap.h> #include <asm/pgalloc.h> #include <asm/prom.h> @@ -151,11 +152,20 @@ int arch_remove_memory(u64 start, u64 size) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; + struct vmem_altmap *altmap; + struct page *page; int ret; - zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages); + /* + * If we have an altmap then we need to skip over any reserved PFNs + * when querying the zone. 
+ */ + page = pfn_to_page(start_pfn); + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + page += vmem_altmap_offset(altmap); + + ret = __remove_pages(page_zone(page), start_pfn, nr_pages); if (ret) return ret; @@ -305,11 +315,11 @@ void __init paging_init(void) unsigned long end = __fix_to_virt(FIX_HOLE); for (; v < end; v += PAGE_SIZE) - map_page(v, 0, 0); /* XXX gross */ + map_kernel_page(v, 0, 0); /* XXX gross */ #endif #ifdef CONFIG_HIGHMEM - map_page(PKMAP_BASE, 0, 0); /* XXX gross */ + map_kernel_page(PKMAP_BASE, 0, 0); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index a3edf813d455..71de2c6d88f3 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -235,10 +235,15 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_PPC_RADIX_MMU void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) { - asm volatile("isync": : :"memory"); - mtspr(SPRN_PID, next->context.id); - asm volatile("isync \n" - PPC_SLBIA(0x7) - : : :"memory"); + + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + isync(); + mtspr(SPRN_PID, next->context.id); + isync(); + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); + } else { + mtspr(SPRN_PID, next->context.id); + isync(); + } } #endif diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index f988db655e5b..d46128b22150 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -94,7 +94,6 @@ extern void _tlbia(void); #ifdef CONFIG_PPC32 extern void mapin_ram(void); -extern int map_page(unsigned long va, phys_addr_t pa, int flags); extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 5fcb3dd74c13..31eed8fa8e99 100644 --- 
a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -32,7 +32,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, { int changed; #ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); + WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); assert_spin_locked(&vma->vm_mm->page_table_lock); #endif changed = !pmd_same(*(pmdp), entry); @@ -59,7 +59,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, #ifdef CONFIG_DEBUG_VM WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(&mm->page_table_lock); - WARN_ON(!pmd_trans_huge(pmd)); + WARN_ON(!(pmd_trans_huge(pmd) || pmd_devmap(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 8b85a14b08ea..188b4107584d 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -11,8 +11,12 @@ #include <linux/sched.h> #include <linux/mm_types.h> +#include <linux/mm.h> #include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/sections.h> +#include <asm/mmu.h> #include <asm/tlb.h> #include "mmu_decl.h" @@ -22,6 +26,81 @@ #ifdef CONFIG_SPARSEMEM_VMEMMAP /* + * vmemmap is the starting address of the virtual address space where + * struct pages are allocated for all possible PFNs present on the system + * including holes and bad memory (hence sparse). These virtual struct + * pages are stored in sequence in this virtual address space irrespective + * of the fact whether the corresponding PFN is valid or not. This achieves + * constant relationship between address of struct page and its PFN. + * + * During boot or memory hotplug operation when a new memory section is + * added, physical memory allocation (including hash table bolting) will + * be performed for the set of struct pages which are part of the memory + * section. 
This saves memory by not allocating struct pages for PFNs + * which are not valid. + * + * ---------------------------------------------- + * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| + * ---------------------------------------------- + * + * f000000000000000 c000000000000000 + * vmemmap +--------------+ +--------------+ + * + | page struct | +--------------> | page struct | + * | +--------------+ +--------------+ + * | | page struct | +--------------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | + +------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | | +--> | page struct | + * | +--------------+ | | +--------------+ + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | +-------+ | + * | +--------------+ | + * | | page struct | +-----------+ + * | +--------------+ + * | | page struct | No mapping + * | +--------------+ + * | | page struct | No mapping + * v +--------------+ + * + * ----------------------------------------- + * | RELATION BETWEEN STRUCT PAGES AND PFNS| + * ----------------------------------------- + * + * vmemmap +--------------+ +---------------+ + * + | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | 
+-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * v +--------------+ +---------------+ + */ +/* * On hash-based CPUs, the vmemmap is bolted in the hash table. * */ @@ -109,7 +188,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr unsigned long old; #ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); + WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); assert_spin_locked(&mm->page_table_lock); #endif @@ -141,6 +220,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); + VM_BUG_ON(pmd_devmap(*pmdp)); pmd = *pmdp; pmd_clear(pmdp); @@ -221,6 +301,7 @@ void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, { VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); + VM_BUG_ON(pmd_devmap(*pmdp)); /* * We can't mark the pmd none here, because that will cause a race @@ -342,3 +423,35 @@ int hash__has_transparent_hugepage(void) return 1; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_STRICT_KERNEL_RWX +void hash__mark_rodata_ro(void) +{ + unsigned long start = (unsigned long)_stext; + unsigned long end = (unsigned long)__init_begin; + unsigned long idx; + unsigned int step, shift; + unsigned long newpp = PP_RXXX; + + shift = mmu_psize_defs[mmu_linear_psize].shift; + step = 1 << shift; + + start = ((start + step - 1) >> shift) << shift; + end = (end >> shift) << shift; + + pr_devel("marking ro start %lx, end %lx, step %x\n", + start, end, step); + + if (start == end) { + pr_warn("could not set rodata ro, relocate the start" + " of the kernel to a 0x%x boundary\n", step); + return; + } + + for (idx = start; idx < end; idx += step) + /* Not sure if we can do much with the return value */ + mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, + mmu_kernel_ssize); + +} +#endif diff --git 
a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index c28165d8970b..8c13e4282308 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -11,6 +11,7 @@ #include <linux/sched/mm.h> #include <linux/memblock.h> #include <linux/of_fdt.h> +#include <linux/mm.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -19,6 +20,8 @@ #include <asm/mmu.h> #include <asm/firmware.h> #include <asm/powernv.h> +#include <asm/sections.h> +#include <asm/trace.h> #include <trace/events/thp.h> @@ -108,6 +111,49 @@ set_the_pte: return 0; } +#ifdef CONFIG_STRICT_KERNEL_RWX +void radix__mark_rodata_ro(void) +{ + unsigned long start = (unsigned long)_stext; + unsigned long end = (unsigned long)__init_begin; + unsigned long idx; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + start = ALIGN_DOWN(start, PAGE_SIZE); + end = PAGE_ALIGN(end); // aligns up + + pr_devel("marking ro start %lx, end %lx\n", start, end); + + for (idx = start; idx < end; idx += PAGE_SIZE) { + pgdp = pgd_offset_k(idx); + pudp = pud_alloc(&init_mm, pgdp, idx); + if (!pudp) + continue; + if (pud_huge(*pudp)) { + ptep = (pte_t *)pudp; + goto update_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, idx); + if (!pmdp) + continue; + if (pmd_huge(*pmdp)) { + ptep = pmdp_ptep(pmdp); + goto update_the_pte; + } + ptep = pte_alloc_kernel(pmdp, idx); + if (!ptep) + continue; +update_the_pte: + radix__pte_update(&init_mm, idx, ptep, _PAGE_WRITE, 0, 0); + } + + radix__flush_tlb_kernel_range(start, end); +} +#endif /* CONFIG_STRICT_KERNEL_RWX */ + static inline void __meminit print_mapping(unsigned long start, unsigned long end, unsigned long size) @@ -121,7 +167,14 @@ static inline void __meminit print_mapping(unsigned long start, static int __meminit create_physical_mapping(unsigned long start, unsigned long end) { - unsigned long addr, mapping_size = 0; + unsigned long vaddr, addr, mapping_size = 0; + pgprot_t prot; + unsigned long max_mapping_size; +#ifdef 
CONFIG_STRICT_KERNEL_RWX + int split_text_mapping = 1; +#else + int split_text_mapping = 0; +#endif start = _ALIGN_UP(start, PAGE_SIZE); for (addr = start; addr < end; addr += mapping_size) { @@ -130,9 +183,12 @@ static int __meminit create_physical_mapping(unsigned long start, gap = end - addr; previous_size = mapping_size; + max_mapping_size = PUD_SIZE; +retry: if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && - mmu_psize_defs[MMU_PAGE_1G].shift) + mmu_psize_defs[MMU_PAGE_1G].shift && + PUD_SIZE <= max_mapping_size) mapping_size = PUD_SIZE; else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && mmu_psize_defs[MMU_PAGE_2M].shift) @@ -140,13 +196,32 @@ static int __meminit create_physical_mapping(unsigned long start, else mapping_size = PAGE_SIZE; + if (split_text_mapping && (mapping_size == PUD_SIZE) && + (addr <= __pa_symbol(__init_begin)) && + (addr + mapping_size) >= __pa_symbol(_stext)) { + max_mapping_size = PMD_SIZE; + goto retry; + } + + if (split_text_mapping && (mapping_size == PMD_SIZE) && + (addr <= __pa_symbol(__init_begin)) && + (addr + mapping_size) >= __pa_symbol(_stext)) + mapping_size = PAGE_SIZE; + if (mapping_size != previous_size) { print_mapping(start, addr, previous_size); start = addr; } - rc = radix__map_kernel_page((unsigned long)__va(addr), addr, - PAGE_KERNEL_X, mapping_size); + vaddr = (unsigned long)__va(addr); + + if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || + overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) + prot = PAGE_KERNEL_X; + else + prot = PAGE_KERNEL; + + rc = radix__map_kernel_page(vaddr, addr, prot, mapping_size); if (rc) return rc; } @@ -190,6 +265,7 @@ static void __init radix_init_pgtable(void) asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); } static void __init radix_init_partition_table(void) @@ -316,6 +392,9 @@ static void 
update_hid_for_radix(void) asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory"); asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); + trace_tlbie(0, 0, rb, 0, 2, 0, 1); + trace_tlbie(0, 0, rb, 0, 2, 1, 1); + /* * now switch the HID */ @@ -683,7 +762,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add unsigned long old; #ifdef CONFIG_DEBUG_VM - WARN_ON(!radix__pmd_trans_huge(*pmdp)); + WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); assert_spin_locked(&mm->page_table_lock); #endif @@ -701,6 +780,7 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); + VM_BUG_ON(pmd_devmap(*pmdp)); /* * khugepaged calls this for normal pmd */ diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index a65c0b4c0669..a9e4bfc025bc 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -60,7 +60,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *ptepage; - gfp_t flags = GFP_KERNEL | __GFP_ZERO; + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT; ptepage = alloc_pages(flags, 0); if (!ptepage) @@ -189,7 +189,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, err = 0; for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_page(v+i, p+i, flags); + err = map_kernel_page(v+i, p+i, flags); if (err) { if (slab_is_available()) vunmap((void *)v); @@ -215,7 +215,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int map_page(unsigned long va, phys_addr_t pa, int flags) +int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) { pmd_t *pd; pte_t *pg; @@ -255,7 +255,7 @@ void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) ktext = ((char *)v >= _stext && (char *)v < etext) || ((char *)v >= _sinittext && (char 
*)v < _einittext); f = ktext ? pgprot_val(PAGE_KERNEL_TEXT) : pgprot_val(PAGE_KERNEL); - map_page(v, p, f); + map_kernel_page(v, p, f); #ifdef CONFIG_PPC_STD_MMU_32 if (ktext) hash_preload(&init_mm, v, 0, 0x300); @@ -387,11 +387,6 @@ void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) return; } - map_page(address, phys, pgprot_val(flags)); + map_kernel_page(address, phys, pgprot_val(flags)); fixmaps++; } - -void __this_fixmap_does_not_exist(void) -{ - WARN_ON(1); -} diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index db93cf747a03..5c0b795d656c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -47,6 +47,7 @@ #include <asm/smp.h> #include <asm/machdep.h> #include <asm/tlb.h> +#include <asm/trace.h> #include <asm/processor.h> #include <asm/cputable.h> #include <asm/sections.h> @@ -323,7 +324,7 @@ struct page *pud_page(pud_t pud) */ struct page *pmd_page(pmd_t pmd) { - if (pmd_trans_huge(pmd) || pmd_huge(pmd)) + if (pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)) return pte_page(pmd_pte(pmd)); return virt_to_page(pmd_page_vaddr(pmd)); } @@ -351,12 +352,20 @@ static pte_t *get_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - if (!page) - return NULL; - if (!kernel && !pgtable_page_ctor(page)) { - __free_page(page); - return NULL; + struct page *page; + + if (!kernel) { + page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + } else { + page = alloc_page(PGALLOC_GFP); + if (!page) + return NULL; } ret = page_address(page); @@ -469,13 +478,31 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, * use of this partition ID was, not the new use. 
*/ asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) + if (old & PATB_HR) { asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - else + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); + } else { asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + } asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); #endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mark_rodata_ro(void) +{ + if (!mmu_has_feature(MMU_FTR_KERNEL_RO)) { + pr_warn("Warning: Unable to mark rodata read only on this CPU.\n"); + return; + } + + if (radix_enabled()) + radix__mark_rodata_ro(); + else + hash__mark_rodata_ro(); +} +#endif diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 654a0d7ba0e7..13cfe413b40d 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -33,15 +33,7 @@ enum slb_index { KSTACK_INDEX = 2, /* Kernel stack map */ }; -extern void slb_allocate_realmode(unsigned long ea); - -static void slb_allocate(unsigned long ea) -{ - /* Currently, we do real mode for all SLBs including user, but - * that will change if we bring back dynamic VSIDs - */ - slb_allocate_realmode(ea); -} +extern void slb_allocate(unsigned long ea); #define slb_esid_mask(ssize) \ (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 1519617aab36..bde378559d01 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -65,14 +65,15 @@ MMU_FTR_SECTION_ELSE \ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) -/* void slb_allocate_realmode(unsigned long ea); +/* void slb_allocate(unsigned long ea); * * Create an SLB entry for the given EA (user or kernel). * r3 = faulting address, r13 = PACA * r9, r10, r11 are clobbered by this function + * r3 is preserved. 
* No other registers are examined or changed. */ -_GLOBAL(slb_allocate_realmode) +_GLOBAL(slb_allocate) /* * check for bad kernel/user address * (ea & ~REGION_MASK) >= PGTABLE_RANGE @@ -235,6 +236,9 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) * dont have any LRU information to help us choose a slot. */ + mr r9,r3 + + /* slb_finish_load_1T continues here. r9=EA with non-ESID bits clear */ 7: ld r10,PACASTABRR(r13) addi r10,r10,1 /* This gets soft patched on boot. */ @@ -249,10 +253,10 @@ slb_compare_rr_to_size: std r10,PACASTABRR(r13) 3: - rldimi r3,r10,0,36 /* r3= EA[0:35] | entry */ - oris r10,r3,SLB_ESID_V@h /* r3 |= SLB_ESID_V */ + rldimi r9,r10,0,36 /* r9 = EA[0:35] | entry */ + oris r10,r9,SLB_ESID_V@h /* r10 = r9 | SLB_ESID_V */ - /* r3 = ESID data, r11 = VSID data */ + /* r9 = ESID data, r11 = VSID data */ /* * No need for an isync before or after this slbmte. The exception @@ -265,21 +269,21 @@ slb_compare_rr_to_size: bgelr cr7 /* Update the slb cache */ - lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ - cmpldi r3,SLB_CACHE_ENTRIES + lhz r9,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ + cmpldi r9,SLB_CACHE_ENTRIES bge 1f /* still room in the slb cache */ - sldi r11,r3,2 /* r11 = offset * sizeof(u32) */ + sldi r11,r9,2 /* r11 = offset * sizeof(u32) */ srdi r10,r10,28 /* get the 36 bits of the ESID */ add r11,r11,r13 /* r11 = (u32 *)paca + offset */ stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ - addi r3,r3,1 /* offset++ */ + addi r9,r9,1 /* offset++ */ b 2f 1: /* offset >= SLB_CACHE_ENTRIES */ - li r3,SLB_CACHE_ENTRIES+1 + li r9,SLB_CACHE_ENTRIES+1 2: - sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ + sth r9,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ crclr 4*cr0+eq /* set result to "success" */ blr @@ -301,11 +305,11 @@ slb_compare_rr_to_size: rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ /* r3 = EA, r11 = VSID data */ - clrrdi r3,r3,SID_SHIFT_1T /* clear out 
non-ESID bits */ + clrrdi r9,r3,SID_SHIFT_1T /* clear out non-ESID bits */ b 7b -_ASM_NOKPROBE_SYMBOL(slb_allocate_realmode) +_ASM_NOKPROBE_SYMBOL(slb_allocate) _ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear) _ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io) _ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 02e71402fdd3..744e0164ecf5 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -16,6 +16,7 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/trace.h> #define RIC_FLUSH_TLB 0 @@ -35,6 +36,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set, asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); } /* @@ -87,6 +89,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); } static inline void _tlbiel_va(unsigned long va, unsigned long pid, @@ -104,6 +107,7 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid, asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("ptesync": : :"memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); } static inline void _tlbie_va(unsigned long va, unsigned long pid, @@ -121,6 +125,7 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); } /* @@ -377,6 +382,7 @@ void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), 
"r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } EXPORT_SYMBOL(radix__flush_tlb_lpid_va); @@ -394,6 +400,7 @@ void radix__flush_tlb_lpid(unsigned long lpid) asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } EXPORT_SYMBOL(radix__flush_tlb_lpid); @@ -420,12 +427,14 @@ void radix__flush_tlb_all(void) */ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); /* * now flush host entires by passing PRS = 0 and LPID == 0 */ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + trace_tlbie(0, 0, rb, 0, ric, prs, r); } void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm, diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 4517aa43a8b1..b5b0fb97b9c0 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -93,12 +93,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, /* * Check if we have an active batch on this CPU. If not, just - * flush now and return. For now, we don global invalidates - * in that case, might be worth testing the mm cpu mask though - * and decide to use local invalidates instead... + * flush now and return. 
*/ if (!batch->active) { - flush_hash_page(vpn, rpte, psize, ssize, 0); + flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm)); put_cpu_var(ppc64_tlb_batch); return; } diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 7b2ca16b1eb4..9c88b82f6229 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> +#include <asm/cputhreads.h> #include <asm/firmware.h> #include <asm/hvcall.h> #include <asm/io.h> @@ -27,6 +28,12 @@ #include "hv-24x7-catalog.h" #include "hv-common.h" +/* Version of the 24x7 hypervisor API that we should use in this machine. */ +static int interface_version; + +/* Whether we have to aggregate result data for some domains. */ +static bool aggregate_result_elements; + static bool domain_is_valid(unsigned domain) { switch (domain) { @@ -54,6 +61,15 @@ static bool is_physical_domain(unsigned domain) } } +/* Domains for which more than one result element are returned for each event. */ +static bool domain_needs_aggregation(unsigned int domain) +{ + return aggregate_result_elements && + (domain == HV_PERF_DOMAIN_PHYS_CORE || + (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE && + domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE)); +} + static const char *domain_name(unsigned domain) { if (!domain_is_valid(domain)) @@ -74,7 +90,11 @@ static const char *domain_name(unsigned domain) static bool catalog_entry_domain_is_valid(unsigned domain) { - return is_physical_domain(domain); + /* POWER8 doesn't support virtual domains. 
*/ + if (interface_version == 1) + return is_physical_domain(domain); + else + return domain_is_valid(domain); } /* @@ -166,6 +186,12 @@ DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); +static unsigned int max_num_requests(int interface_version) +{ + return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer)) + / H24x7_REQUEST_SIZE(interface_version); +} + static char *event_name(struct hv_24x7_event_data *ev, int *len) { *len = be16_to_cpu(ev->event_name_len) - 2; @@ -260,9 +286,8 @@ static void *event_end(struct hv_24x7_event_data *ev, void *end) return start + nl + dl + ldl; } -static unsigned long h_get_24x7_catalog_page_(unsigned long phys_4096, - unsigned long version, - unsigned long index) +static long h_get_24x7_catalog_page_(unsigned long phys_4096, + unsigned long version, unsigned long index) { pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)", phys_4096, version, index); @@ -273,8 +298,7 @@ static unsigned long h_get_24x7_catalog_page_(unsigned long phys_4096, phys_4096, version, index); } -static unsigned long h_get_24x7_catalog_page(char page[], - u64 version, u32 index) +static long h_get_24x7_catalog_page(char page[], u64 version, u32 index) { return h_get_24x7_catalog_page_(virt_to_phys(page), version, index); @@ -664,13 +688,13 @@ static int create_events_from_catalog(struct attribute ***events_, struct attribute ***event_descs_, struct attribute ***event_long_descs_) { - unsigned long hret; + long hret; size_t catalog_len, catalog_page_len, event_entry_count, event_data_len, event_data_offs, event_data_bytes, junk_events, event_idx, event_attr_ct, i, attr_max, event_idx_last, desc_ct, long_desc_ct; ssize_t ct, ev_len; - uint32_t catalog_version_num; + uint64_t catalog_version_num; struct attribute **events, **event_descs, **event_long_descs; struct hv_24x7_catalog_page_0 *page_0 = 
kmem_cache_alloc(hv_page_cache, GFP_KERNEL); @@ -706,8 +730,8 @@ static int create_events_from_catalog(struct attribute ***events_, event_data_offs = be16_to_cpu(page_0->event_data_offs); event_data_len = be16_to_cpu(page_0->event_data_len); - pr_devel("cv %zu cl %zu eec %zu edo %zu edl %zu\n", - (size_t)catalog_version_num, catalog_len, + pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n", + catalog_version_num, catalog_len, event_entry_count, event_data_offs, event_data_len); if ((MAX_4K < event_data_len) @@ -761,8 +785,8 @@ static int create_events_from_catalog(struct attribute ***events_, catalog_version_num, i + event_data_offs); if (hret) { - pr_err("failed to get event data in page %zu\n", - i + event_data_offs); + pr_err("Failed to get event data in page %zu: rc=%ld\n", + i + event_data_offs, hret); ret = -EIO; goto e_event_data; } @@ -903,7 +927,7 @@ static ssize_t catalog_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t offset, size_t count) { - unsigned long hret; + long hret; ssize_t ret = 0; size_t catalog_len = 0, catalog_page_len = 0; loff_t page_offset = 0; @@ -988,7 +1012,7 @@ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *dev_attr, \ char *buf) \ { \ - unsigned long hret; \ + long hret; \ ssize_t ret = 0; \ void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \ struct hv_24x7_catalog_page_0 *page_0 = page; \ @@ -1040,21 +1064,6 @@ static const struct attribute_group *attr_groups[] = { NULL, }; -static void log_24x7_hcall(struct hv_24x7_request_buffer *request_buffer, - struct hv_24x7_data_result_buffer *result_buffer, - unsigned long ret) -{ - struct hv_24x7_request *req; - - req = &request_buffer->requests[0]; - pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => " - "ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", - req->performance_domain, req->data_offset, - req->starting_ix, req->starting_lpar_ix, ret, ret, - result_buffer->detailed_rc, - 
result_buffer->failing_request_ix); -} - /* * Start the process for a new H_GET_24x7_DATA hcall. */ @@ -1062,10 +1071,10 @@ static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, struct hv_24x7_data_result_buffer *result_buffer) { - memset(request_buffer, 0, 4096); - memset(result_buffer, 0, 4096); + memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE); + memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE); - request_buffer->interface_version = HV_24X7_IF_VERSION_CURRENT; + request_buffer->interface_version = interface_version; /* memset above set request_buffer->num_requests to 0 */ } @@ -1076,7 +1085,7 @@ static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, struct hv_24x7_data_result_buffer *result_buffer) { - unsigned long ret; + long ret; /* * NOTE: Due to variable number of array elements in request and @@ -1087,10 +1096,19 @@ static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE, virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE); - if (ret) - log_24x7_hcall(request_buffer, result_buffer, ret); + if (ret) { + struct hv_24x7_request *req; + + req = request_buffer->requests; + pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", + req->performance_domain, req->data_offset, + req->starting_ix, req->starting_lpar_ix, + ret, ret, result_buffer->detailed_rc, + result_buffer->failing_request_ix); + return -EIO; + } - return ret; + return 0; } /* @@ -1105,9 +1123,11 @@ static int add_event_to_24x7_request(struct perf_event *event, { u16 idx; int i; + size_t req_size; struct hv_24x7_request *req; - if (request_buffer->num_requests > 254) { + if (request_buffer->num_requests >= + max_num_requests(request_buffer->interface_version)) { pr_devel("Too many requests for 24x7 HCALL %d\n", request_buffer->num_requests); return -EINVAL; @@ 
-1124,23 +1144,113 @@ static int add_event_to_24x7_request(struct perf_event *event, idx = event_get_vcpu(event); } + req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version); + i = request_buffer->num_requests++; - req = &request_buffer->requests[i]; + req = (void *) request_buffer->requests + i * req_size; req->performance_domain = event_get_domain(event); req->data_size = cpu_to_be16(8); req->data_offset = cpu_to_be32(event_get_offset(event)); - req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event)), + req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event)); req->max_num_lpars = cpu_to_be16(1); req->starting_ix = cpu_to_be16(idx); req->max_ix = cpu_to_be16(1); + if (request_buffer->interface_version > 1) { + if (domain_needs_aggregation(req->performance_domain)) + req->max_num_thread_groups = -1; + else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { + req->starting_thread_group_ix = idx % 2; + req->max_num_thread_groups = 1; + } + } + + return 0; +} + +/** + * get_count_from_result - get event count from all result elements in result + * + * If the event corresponding to this result needs aggregation of the result + * element values, then this function does that. + * + * @event: Event associated with @res. + * @resb: Result buffer containing @res. + * @res: Result to work on. + * @countp: Output variable containing the event count. + * @next: Optional output variable pointing to the next result in @resb. + */ +static int get_count_from_result(struct perf_event *event, + struct hv_24x7_data_result_buffer *resb, + struct hv_24x7_result *res, u64 *countp, + struct hv_24x7_result **next) +{ + u16 num_elements = be16_to_cpu(res->num_elements_returned); + u16 data_size = be16_to_cpu(res->result_element_data_size); + unsigned int data_offset; + void *element_data; + int i; + u64 count; + + /* + * We can bail out early if the result is empty. 
+ */ + if (!num_elements) { + pr_debug("Result of request %hhu is empty, nothing to do\n", + res->result_ix); + + if (next) + *next = (struct hv_24x7_result *) res->elements; + + return -ENODATA; + } + + /* + * Since we always specify 1 as the maximum for the smallest resource + * we're requesting, there should to be only one element per result. + * Except when an event needs aggregation, in which case there are more. + */ + if (num_elements != 1 && + !domain_needs_aggregation(event_get_domain(event))) { + pr_err("Error: result of request %hhu has %hu elements\n", + res->result_ix, num_elements); + + return -EIO; + } + + if (data_size != sizeof(u64)) { + pr_debug("Error: result of request %hhu has data of %hu bytes\n", + res->result_ix, data_size); + + return -ENOTSUPP; + } + + if (resb->interface_version == 1) + data_offset = offsetof(struct hv_24x7_result_element_v1, + element_data); + else + data_offset = offsetof(struct hv_24x7_result_element_v2, + element_data); + + /* Go through the result elements in the result. */ + for (i = count = 0, element_data = res->elements + data_offset; + i < num_elements; + i++, element_data += data_size + data_offset) + count += be64_to_cpu(*((u64 *) element_data)); + + *countp = count; + + /* The next result is after the last result element. 
*/ + if (next) + *next = element_data - data_offset; + return 0; } -static unsigned long single_24x7_request(struct perf_event *event, u64 *count) +static int single_24x7_request(struct perf_event *event, u64 *count) { - unsigned long ret; + int ret; struct hv_24x7_request_buffer *request_buffer; struct hv_24x7_data_result_buffer *result_buffer; @@ -1157,13 +1267,12 @@ static unsigned long single_24x7_request(struct perf_event *event, u64 *count) goto out; ret = make_24x7_request(request_buffer, result_buffer); - if (ret) { - log_24x7_hcall(request_buffer, result_buffer, ret); + if (ret) goto out; - } /* process result from hcall */ - *count = be64_to_cpu(result_buffer->results[0].elements[0].element_data[0]); + ret = get_count_from_result(event, result_buffer, + result_buffer->results, count, NULL); out: put_cpu_var(hv_24x7_reqb); @@ -1216,9 +1325,8 @@ static int h_24x7_event_init(struct perf_event *event) return -EINVAL; } - /* Domains above 6 are invalid */ domain = event_get_domain(event); - if (domain > 6) { + if (domain >= HV_PERF_DOMAIN_MAX) { pr_devel("invalid domain %d\n", domain); return -EINVAL; } @@ -1250,10 +1358,9 @@ static int h_24x7_event_init(struct perf_event *event) static u64 h_24x7_get_value(struct perf_event *event) { - unsigned long ret; u64 ct; - ret = single_24x7_request(event, &ct); - if (ret) + + if (single_24x7_request(event, &ct)) /* We checked this in event init, shouldn't fail here... 
*/ return 0; @@ -1396,8 +1503,7 @@ static int h_24x7_event_commit_txn(struct pmu *pmu) { struct hv_24x7_request_buffer *request_buffer; struct hv_24x7_data_result_buffer *result_buffer; - struct hv_24x7_result *resb; - struct perf_event *event; + struct hv_24x7_result *res, *next_res; u64 count; int i, ret, txn_flags; struct hv_24x7_hw *h24x7hw; @@ -1417,19 +1523,21 @@ static int h_24x7_event_commit_txn(struct pmu *pmu) result_buffer = (void *)get_cpu_var(hv_24x7_resb); ret = make_24x7_request(request_buffer, result_buffer); - if (ret) { - log_24x7_hcall(request_buffer, result_buffer, ret); + if (ret) goto put_reqb; - } h24x7hw = &get_cpu_var(hv_24x7_hw); - /* Update event counts from hcall */ - for (i = 0; i < request_buffer->num_requests; i++) { - resb = &result_buffer->results[i]; - count = be64_to_cpu(resb->elements[0].element_data[0]); - event = h24x7hw->events[i]; - h24x7hw->events[i] = NULL; + /* Go through results in the result buffer to update event counts. */ + for (i = 0, res = result_buffer->results; + i < result_buffer->num_results; i++, res = next_res) { + struct perf_event *event = h24x7hw->events[res->result_ix]; + + ret = get_count_from_result(event, result_buffer, res, &count, + &next_res); + if (ret) + break; + update_event_count(event, count); } @@ -1480,6 +1588,18 @@ static int hv_24x7_init(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) { pr_debug("not a virtualized system, not enabling\n"); return -ENODEV; + } else if (!cur_cpu_spec->oprofile_cpu_type) + return -ENODEV; + + /* POWER8 only supports v1, while POWER9 only supports v2. */ + if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8")) + interface_version = 1; + else { + interface_version = 2; + + /* SMT8 in POWER9 needs to aggregate result elements. 
*/ + if (threads_per_core == 8) + aggregate_result_elements = true; } hret = hv_perf_caps_get(&caps); diff --git a/arch/powerpc/perf/hv-24x7.h b/arch/powerpc/perf/hv-24x7.h index 634ef4082cdc..5092c4a222a6 100644 --- a/arch/powerpc/perf/hv-24x7.h +++ b/arch/powerpc/perf/hv-24x7.h @@ -10,6 +10,8 @@ enum hv_perf_domains { HV_PERF_DOMAIN_MAX, }; +#define H24x7_REQUEST_SIZE(iface_version) (iface_version == 1 ? 16 : 32) + struct hv_24x7_request { /* PHYSICAL domains require enabling via phyp/hmc. */ __u8 performance_domain; @@ -42,19 +44,27 @@ struct hv_24x7_request { /* chip, core, or virtual processor based on @performance_domain */ __be16 starting_ix; __be16 max_ix; + + /* The following fields were added in v2 of the 24x7 interface. */ + + __u8 starting_thread_group_ix; + + /* -1 means all thread groups starting at @starting_thread_group_ix */ + __u8 max_num_thread_groups; + + __u8 reserved2[0xE]; } __packed; struct hv_24x7_request_buffer { /* 0 - ? */ /* 1 - ? */ -#define HV_24X7_IF_VERSION_CURRENT 0x01 __u8 interface_version; __u8 num_requests; __u8 reserved[0xE]; - struct hv_24x7_request requests[1]; + struct hv_24x7_request requests[]; } __packed; -struct hv_24x7_result_element { +struct hv_24x7_result_element_v1 { __be16 lpar_ix; /* @@ -67,10 +77,38 @@ struct hv_24x7_result_element { __be32 lpar_cfg_instance_id; /* size = @result_element_data_size of containing result. */ - __u64 element_data[1]; + __u64 element_data[]; +} __packed; + +/* + * We need a separate struct for v2 because the offset of @element_data changed + * between versions. + */ +struct hv_24x7_result_element_v2 { + __be16 lpar_ix; + + /* + * represents the core, chip, or virtual processor based on the + * request's @performance_domain + */ + __be16 domain_ix; + + /* -1 if @performance_domain does not refer to a virtual processor */ + __be32 lpar_cfg_instance_id; + + __u8 thread_group_ix; + + __u8 reserved[7]; + + /* size = @result_element_data_size of containing result. 
*/ + __u64 element_data[]; } __packed; struct hv_24x7_result { + /* + * The index of the 24x7 Request Structure in the 24x7 Request Buffer + * used to request this result. + */ __u8 result_ix; /* @@ -81,14 +119,25 @@ struct hv_24x7_result { __u8 results_complete; __be16 num_elements_returned; - /* This is a copy of @data_size from the corresponding hv_24x7_request */ + /* + * This is a copy of @data_size from the corresponding hv_24x7_request + * + * Warning: to obtain the size of each element in @elements you have + * to add the size of the other members of the result_element struct. + */ __be16 result_element_data_size; __u8 reserved[0x2]; - /* WARNING: only valid for first result element due to variable sizes - * of result elements */ - /* struct hv_24x7_result_element[@num_elements_returned] */ - struct hv_24x7_result_element elements[1]; + /* + * Either + * struct hv_24x7_result_element_v1[@num_elements_returned] + * or + * struct hv_24x7_result_element_v2[@num_elements_returned] + * + * depending on the interface_version field of the + * struct hv_24x7_data_result_buffer containing this result. 
+ */ + char elements[]; } __packed; struct hv_24x7_data_result_buffer { @@ -104,7 +153,7 @@ struct hv_24x7_data_result_buffer { __u8 reserved2[0x8]; /* WARNING: only valid for the first result due to variable sizes of * results */ - struct hv_24x7_result results[1]; /* [@num_results] */ + struct hv_24x7_result results[]; /* [@num_results] */ } __packed; #endif diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h index 71a6bfee5c02..80204e064362 100644 --- a/arch/powerpc/perf/power9-events-list.h +++ b/arch/powerpc/perf/power9-events-list.h @@ -16,7 +16,7 @@ EVENT(PM_CYC, 0x0001e) EVENT(PM_ICT_NOSLOT_CYC, 0x100f8) EVENT(PM_CMPLU_STALL, 0x1e054) EVENT(PM_INST_CMPL, 0x00002) -EVENT(PM_BRU_CMPL, 0x10012) +EVENT(PM_BRU_CMPL, 0x4d05e) EVENT(PM_BR_MPRED_CMPL, 0x400f6) /* All L1 D cache load references counted at finish, gated by reject */ @@ -56,3 +56,5 @@ EVENT(PM_RUN_CYC, 0x600f4) /* Instruction Dispatched */ EVENT(PM_INST_DISP, 0x200f2) EVENT(PM_INST_DISP_ALT, 0x300f2) +/* Alternate Branch event code */ +EVENT(PM_BR_CMPL_ALT, 0x10012) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index bb28e1a41257..f17435e4a489 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -231,7 +231,7 @@ static int power9_generic_events_dd1[] = { [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_ICT_NOSLOT_CYC, [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PM_CMPLU_STALL, [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_DISP, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BRU_CMPL, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BR_CMPL_ALT, [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL, [PERF_COUNT_HW_CACHE_REFERENCES] = PM_LD_REF_L1, [PERF_COUNT_HW_CACHE_MISSES] = PM_LD_MISS_L1_FIN, @@ -453,6 +453,12 @@ static int __init init_power9_pmu(void) * sampling scenarios in power9 DD1, instead use PM_INST_DISP. 
*/ EVENT_VAR(PM_INST_CMPL, _g).id = PM_INST_DISP; + /* + * Power9 DD1 should use PM_BR_CMPL_ALT event code for + * "branches" to provide correct counter value. + */ + EVENT_VAR(PM_BRU_CMPL, _g).id = PM_BR_CMPL_ALT; + EVENT_VAR(PM_BRU_CMPL, _c).id = PM_BR_CMPL_ALT; rc = register_power_pmu(&power9_isa207_pmu); } else { rc = register_power_pmu(&power9_pmu); diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 9b0afe935cc1..01cb109ebf17 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -199,6 +199,18 @@ config CURRITUCK help This option enables support for the IBM Currituck (476fpe) evaluation board +config FSP2 + bool "IBM FSP2 (476fpe) Support" + depends on PPC_47x + default n + select 476FPE + select IBM_EMAC_EMAC4 if IBM_EMAC + select IBM_EMAC_RGMII if IBM_EMAC + select COMMON_CLK + select DEFAULT_UIMAGE + help + This option enables support for the IBM FSP2 (476fpe) board + config AKEBONO bool "IBM Akebono (476gtr) Support" depends on PPC_47x diff --git a/arch/powerpc/platforms/44x/Makefile b/arch/powerpc/platforms/44x/Makefile index 26d35b5941f7..72b824160660 100644 --- a/arch/powerpc/platforms/44x/Makefile +++ b/arch/powerpc/platforms/44x/Makefile @@ -12,3 +12,4 @@ obj-$(CONFIG_ISS4xx) += iss4xx.o obj-$(CONFIG_CANYONLANDS)+= canyonlands.o obj-$(CONFIG_CURRITUCK) += ppc476.o obj-$(CONFIG_AKEBONO) += ppc476.o +obj-$(CONFIG_FSP2) += fsp2.o diff --git a/arch/powerpc/platforms/44x/fsp2.c b/arch/powerpc/platforms/44x/fsp2.c new file mode 100644 index 000000000000..92e98048404f --- /dev/null +++ b/arch/powerpc/platforms/44x/fsp2.c @@ -0,0 +1,62 @@ +/* + * FSP-2 board specific routines + * + * Based on earlier code: + * Matt Porter <mporter@kernel.crashing.org> + * Copyright 2002-2005 MontaVista Software Inc. 
+ * + * Eugene Surovegin <eugene.surovegin@zultys.com> or <ebs@ebshome.net> + * Copyright (c) 2003-2005 Zultys Technologies + * + * Rewritten and ported to the merged powerpc tree: + * Copyright 2007 David Gibson <dwg@au1.ibm.com>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/init.h> +#include <linux/of_platform.h> +#include <linux/rtc.h> + +#include <asm/machdep.h> +#include <asm/prom.h> +#include <asm/udbg.h> +#include <asm/time.h> +#include <asm/uic.h> +#include <asm/ppc4xx.h> + +static __initdata struct of_device_id fsp2_of_bus[] = { + { .compatible = "ibm,plb4", }, + { .compatible = "ibm,plb6", }, + { .compatible = "ibm,opb", }, + {}, +}; + +static int __init fsp2_device_probe(void) +{ + of_platform_bus_probe(NULL, fsp2_of_bus, NULL); + return 0; +} +machine_device_initcall(fsp2, fsp2_device_probe); + +static int __init fsp2_probe(void) +{ + unsigned long root = of_get_flat_dt_root(); + + if (!of_flat_dt_is_compatible(root, "ibm,fsp2")) + return 0; + return 1; +} + +define_machine(fsp2) { + .name = "FSP-2", + .probe = fsp2_probe, + .progress = udbg_progress, + .init_IRQ = uic_init_tree, + .get_irq = uic_get_irq, + .restart = ppc4xx_reset_system, + .calibrate_decr = generic_calibrate_decr, +}; diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index 895560f4be69..f84d52a2db40 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -115,7 +115,8 @@ static void smp_cell_setup_cpu(int cpu) static int smp_cell_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= nr_cpu_ids) + return -EINVAL; if (!smp_startup_cpu(nr)) return -ENOENT; diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c index d12ea7b9fd47..3f48f6df1cf3 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -48,6 +48,7 @@ static int pnv_eeh_init(void) { struct pci_controller *hose; struct pnv_phb *phb; + int max_diag_size = PNV_PCI_DIAG_BUF_SIZE; if (!firmware_has_feature(FW_FEATURE_OPAL)) { pr_warn("%s: OPAL is required !\n", @@ -69,6 +70,9 @@ static int pnv_eeh_init(void) if (phb->model == PNV_PHB_MODEL_P7IOC) eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); + if (phb->diag_data_size > max_diag_size) + max_diag_size = phb->diag_data_size; + /* * PE#0 should be regarded as valid by EEH core * if it's not the reserved one. Currently, we @@ -82,6 +86,8 @@ static int pnv_eeh_init(void) break; } + eeh_set_pe_aux_size(max_diag_size); + return 0; } @@ -540,7 +546,7 @@ static void pnv_eeh_get_phb_diag(struct eeh_pe *pe) s64 rc; rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data, - PNV_PCI_DIAG_BUF_SIZE); + phb->diag_data_size); if (rc != OPAL_SUCCESS) pr_warn("%s: Failure %lld getting PHB#%x diag-data\n", __func__, rc, pe->phb->global_number); @@ -1314,7 +1320,8 @@ static void pnv_eeh_dump_hub_diag_common(struct OpalIoP7IOCErrorData *data) static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; - struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag; + struct OpalIoP7IOCErrorData *data = + (struct OpalIoP7IOCErrorData*)phb->diag_data; long rc; rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data)); @@ -1549,10 +1556,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) /* Dump PHB diag-data */ rc = opal_pci_get_phb_diag_data2(phb->opal_id, - phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); + phb->diag_data, phb->diag_data_size); if (rc == OPAL_SUCCESS) pnv_pci_dump_phb_diag_data(hose, - phb->diag.blob); + phb->diag_data); /* Try best to clear it */ opal_pci_eeh_freeze_clear(phb->opal_id, @@ -1795,7 +1802,6 @@ static int __init 
eeh_powernv_init(void) { int ret = -EINVAL; - eeh_set_pe_aux_size(PNV_PCI_DIAG_BUF_SIZE); ret = eeh_ops_register(&pnv_eeh_ops); if (!ret) pr_info("EEH: PowerNV platform initialized\n"); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 445f30a2c5ef..2abee070373f 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -23,6 +23,7 @@ #include <asm/cpuidle.h> #include <asm/code-patching.h> #include <asm/smp.h> +#include <asm/runlatch.h> #include "powernv.h" #include "subcore.h" @@ -30,8 +31,33 @@ /* Power ISA 3.0 allows for stop states 0x0 - 0xF */ #define MAX_STOP_STATE 0xF +#define P9_STOP_SPR_MSR 2000 +#define P9_STOP_SPR_PSSCR 855 + static u32 supported_cpuidle_states; +/* + * The default stop state that will be used by ppc_md.power_save + * function on platforms that support stop instruction. + */ +static u64 pnv_default_stop_val; +static u64 pnv_default_stop_mask; +static bool default_stop_found; + +/* + * First deep stop state. Used to figure out when to save/restore + * hypervisor context. + */ +u64 pnv_first_deep_stop_state = MAX_STOP_STATE; + +/* + * psscr value and mask of the deepest stop idle state. + * Used when a cpu is offlined. 
+ */ +static u64 pnv_deepest_stop_psscr_val; +static u64 pnv_deepest_stop_psscr_mask; +static bool deepest_stop_found; + static int pnv_save_sprs_for_deep_states(void) { int cpu; @@ -48,6 +74,8 @@ static int pnv_save_sprs_for_deep_states(void) uint64_t hid4_val = mfspr(SPRN_HID4); uint64_t hid5_val = mfspr(SPRN_HID5); uint64_t hmeer_val = mfspr(SPRN_HMEER); + uint64_t msr_val = MSR_IDLE; + uint64_t psscr_val = pnv_deepest_stop_psscr_val; for_each_possible_cpu(cpu) { uint64_t pir = get_hard_smp_processor_id(cpu); @@ -61,6 +89,18 @@ static int pnv_save_sprs_for_deep_states(void) if (rc != 0) return rc; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + rc = opal_slw_set_reg(pir, P9_STOP_SPR_MSR, msr_val); + if (rc) + return rc; + + rc = opal_slw_set_reg(pir, + P9_STOP_SPR_PSSCR, psscr_val); + + if (rc) + return rc; + } + /* HIDs are per core registers */ if (cpu_thread_in_core(cpu) == 0) { @@ -72,17 +112,21 @@ static int pnv_save_sprs_for_deep_states(void) if (rc != 0) return rc; - rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); - if (rc != 0) - return rc; + /* Only p8 needs to set extra HID regiters */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { - rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); - if (rc != 0) - return rc; + rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); + if (rc != 0) + return rc; - rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); - if (rc != 0) - return rc; + rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); + if (rc != 0) + return rc; + } } } @@ -96,15 +140,24 @@ static void pnv_alloc_idle_core_states(void) u32 *core_idle_state; /* - * core_idle_state - First 8 bits track the idle state of each thread - * of the core. The 8th bit is the lock bit. Initially all thread bits - * are set. They are cleared when the thread enters deep idle state - * like sleep and winkle. Initially the lock bit is cleared. - * The lock bit has 2 purposes - * a. 
While the first thread is restoring core state, it prevents - * other threads in the core from switching to process context. - * b. While the last thread in the core is saving the core state, it - * prevents a different thread from waking up. + * core_idle_state - The lower 8 bits track the idle state of + * each thread of the core. + * + * The most significant bit is the lock bit. + * + * Initially all the bits corresponding to threads_per_core + * are set. They are cleared when the thread enters deep idle + * state like sleep and winkle/stop. + * + * Initially the lock bit is cleared. The lock bit has 2 + * purposes: + * a. While the first thread in the core waking up from + * idle is restoring core state, it prevents other + * threads in the core from switching to process + * context. + * b. While the last thread in the core is saving the + * core state, it prevents a different thread from + * waking up. */ for (i = 0; i < nr_cores; i++) { int first_cpu = i * threads_per_core; @@ -112,7 +165,7 @@ static void pnv_alloc_idle_core_states(void) size_t paca_ptr_array_size; core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); - *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; + *core_idle_state = (1 << threads_per_core) - 1; paca_ptr_array_size = (threads_per_core * sizeof(struct paca_struct *)); @@ -231,56 +284,104 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, show_fastsleep_workaround_applyonce, store_fastsleep_workaround_applyonce); -/* - * The default stop state that will be used by ppc_md.power_save - * function on platforms that support stop instruction. 
- */ -static u64 pnv_default_stop_val; -static u64 pnv_default_stop_mask; -static bool default_stop_found; +static unsigned long __power7_idle_type(unsigned long type) +{ + unsigned long srr1; -/* - * Used for ppc_md.power_save which needs a function with no parameters - */ -static void power9_idle(void) + if (!prep_irq_for_idle_irqsoff()) + return 0; + + __ppc64_runlatch_off(); + srr1 = power7_idle_insn(type); + __ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + return srr1; +} + +void power7_idle_type(unsigned long type) +{ + unsigned long srr1; + + srr1 = __power7_idle_type(type); + irq_set_pending_from_srr1(srr1); +} + +void power7_idle(void) { - power9_idle_stop(pnv_default_stop_val, pnv_default_stop_mask); + if (!powersave_nap) + return; + + power7_idle_type(PNV_THREAD_NAP); } -/* - * First deep stop state. Used to figure out when to save/restore - * hypervisor context. - */ -u64 pnv_first_deep_stop_state = MAX_STOP_STATE; +static unsigned long __power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + unsigned long psscr; + unsigned long srr1; + + if (!prep_irq_for_idle_irqsoff()) + return 0; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + __ppc64_runlatch_off(); + srr1 = power9_idle_stop(psscr); + __ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + return srr1; +} + +void power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + unsigned long srr1; + + srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask); + irq_set_pending_from_srr1(srr1); +} /* - * psscr value and mask of the deepest stop idle state. - * Used when a cpu is offlined. 
+ * Used for ppc_md.power_save which needs a function with no parameters */ -static u64 pnv_deepest_stop_psscr_val; -static u64 pnv_deepest_stop_psscr_mask; -static bool deepest_stop_found; +void power9_idle(void) +{ + power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask); +} +#ifdef CONFIG_HOTPLUG_CPU /* * pnv_cpu_offline: A function that puts the CPU into the deepest * available platform idle state on a CPU-Offline. + * interrupts hard disabled and no lazy irq pending. */ unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; - u32 idle_states = pnv_get_supported_cpuidle_states(); + __ppc64_runlatch_off(); + if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { - srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, - pnv_deepest_stop_psscr_mask); + unsigned long psscr; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~pnv_deepest_stop_psscr_mask) | + pnv_deepest_stop_psscr_val; + srr1 = power9_idle_stop(psscr); + } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { - srr1 = power7_winkle(); + srr1 = power7_idle_insn(PNV_THREAD_WINKLE); } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_sleep(); + srr1 = power7_idle_insn(PNV_THREAD_SLEEP); } else if (idle_states & OPAL_PM_NAP_ENABLED) { - srr1 = power7_nap(1); + srr1 = power7_idle_insn(PNV_THREAD_NAP); } else { /* This is the fallback method. We emulate snooze */ while (!generic_check_cpu_restart(cpu)) { @@ -291,8 +392,11 @@ unsigned long pnv_cpu_offline(unsigned int cpu) HMT_medium(); } + __ppc64_runlatch_on(); + return srr1; } +#endif /* * Power ISA 3.0 idle initialization. 
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index f620572f891f..4ca6c26a56d5 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -99,10 +99,10 @@ opal_return: lwz r4,8(r1); ld r5,PPC_LR_STKOFF(r1); ld r6,PACASAVEDMSR(r13); - mtspr SPRN_SRR0,r5; - mtspr SPRN_SRR1,r6; mtcr r4; - rfid + mtspr SPRN_HSRR0,r5; + mtspr SPRN_HSRR1,r6; + hrfid opal_real_call: mfcr r11 diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 283caf1070c9..437613588df1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1718,6 +1718,100 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev */ } +static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) +{ + unsigned short vendor = 0; + struct pci_dev *pdev; + + if (pe->device_count == 1) + return true; + + /* pe->pdev should be set if it's a single device, pe->pbus if not */ + if (!pe->pbus) + return true; + + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { + if (!vendor) { + vendor = pdev->vendor; + continue; + } + + if (pdev->vendor != vendor) + return false; + } + + return true; +} + +/* + * Reconfigure TVE#0 to be usable as 64-bit DMA space. + * + * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses. + * Devices can only access more than that if bit 59 of the PCI address is set + * by hardware, which indicates TVE#1 should be used instead of TVE#0. + * Many PCI devices are not capable of addressing that many bits, and as a + * result are limited to the 4GB of virtual memory made available to 32-bit + * devices in TVE#0. + * + * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit + * devices by configuring the virtual memory past the first 4GB inaccessible + * by 64-bit DMAs. 
This should only be used by devices that want more than + * 4GB, and only on PEs that have no 32-bit devices. + * + * Currently this will only work on PHB3 (POWER8). + */ +static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe) +{ + u64 window_size, table_size, tce_count, addr; + struct page *table_pages; + u64 tce_order = 28; /* 256MB TCEs */ + __be64 *tces; + s64 rc; + + /* + * Window size needs to be a power of two, but needs to account for + * shifting memory by the 4GB offset required to skip 32bit space. + */ + window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32)); + tce_count = window_size >> tce_order; + table_size = tce_count << 3; + + if (table_size < PAGE_SIZE) + table_size = PAGE_SIZE; + + table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL, + get_order(table_size)); + if (!table_pages) + goto err; + + tces = page_address(table_pages); + if (!tces) + goto err; + + memset(tces, 0, table_size); + + for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) { + tces[(addr + (1ULL << 32)) >> tce_order] = + cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); + } + + rc = opal_pci_map_pe_dma_window(pe->phb->opal_id, + pe->pe_number, + /* reconfigure window 0 */ + (pe->pe_number << 1) + 0, + 1, + __pa(tces), + table_size, + 1 << tce_order); + if (rc == OPAL_SUCCESS) { + pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n"); + return 0; + } +err: + pe_err(pe, "Error configuring 64-bit DMA bypass\n"); + return -EIO; +} + static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); @@ -1726,6 +1820,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) struct pnv_ioda_pe *pe; uint64_t top; bool bypass = false; + s64 rc; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV;; @@ -1740,8 +1835,27 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) dev_info(&pdev->dev, "Using 
64-bit DMA iommu bypass\n"); set_dma_ops(&pdev->dev, &dma_direct_ops); } else { - dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(&pdev->dev, &dma_iommu_ops); + /* + * If the device can't set the TCE bypass bit but still wants + * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to + * bypass the 32-bit region and be usable for 64-bit DMAs. + * The device needs to be able to address all of this space. + */ + if (dma_mask >> 32 && + dma_mask > (memory_hotplug_max() + (1ULL << 32)) && + pnv_pci_ioda_pe_single_vendor(pe) && + phb->model == PNV_PHB_MODEL_PHB3) { + /* Configure the bypass mode */ + rc = pnv_pci_ioda_dma_64bit_bypass(pe); + if (rc) + return rc; + /* 4GB offset bypasses 32-bit space */ + set_dma_offset(&pdev->dev, (1ULL << 32)); + set_dma_ops(&pdev->dev, &dma_direct_ops); + } else { + dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); + set_dma_ops(&pdev->dev, &dma_iommu_ops); + } } *pdev->dev.dma_mask = dma_mask; @@ -3123,13 +3237,13 @@ static int pnv_pci_diag_data_set(void *data, u64 val) phb = hose->private_data; /* Retrieve the diag data from firmware */ - ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, - PNV_PCI_DIAG_BUF_SIZE); + ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, + phb->diag_data_size); if (ret != OPAL_SUCCESS) return -EIO; /* Print the diag data to the kernel log */ - pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); + pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data); return 0; } @@ -3725,6 +3839,15 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, else phb->model = PNV_PHB_MODEL_UNKNOWN; + /* Initialize diagnostic data buffer */ + prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL); + if (prop32) + phb->diag_data_size = be32_to_cpup(prop32); + else + phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE; + + phb->diag_data = memblock_virt_alloc(phb->diag_data_size, 0); + /* Parse 32-bit and IO ranges (if any) */ pci_process_bridge_OF_ranges(hose, 
np, !hose->global_number); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 935ccb249a8a..7905d179d036 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -227,11 +227,39 @@ void pnv_teardown_msi_irqs(struct pci_dev *pdev) } #endif /* CONFIG_PCI_MSI */ +/* Nicely print the contents of the PE State Tables (PEST). */ +static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size) +{ + __be64 prevA = ULONG_MAX, prevB = ULONG_MAX; + bool dup = false; + int i; + + for (i = 0; i < pest_size; i++) { + __be64 peA = be64_to_cpu(pestA[i]); + __be64 peB = be64_to_cpu(pestB[i]); + + if (peA != prevA || peB != prevB) { + if (dup) { + pr_info("PE[..%03x] A/B: as above\n", i-1); + dup = false; + } + prevA = peA; + prevB = peB; + if (peA & PNV_IODA_STOPPED_STATE || + peB & PNV_IODA_STOPPED_STATE) + pr_info("PE[%03x] A/B: %016llx %016llx\n", + i, peA, peB); + } else if (!dup && (peA & PNV_IODA_STOPPED_STATE || + peB & PNV_IODA_STOPPED_STATE)) { + dup = true; + } + } +} + static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, struct OpalIoPhbErrorCommon *common) { struct OpalIoP7IOCPhbErrorData *data; - int i; data = (struct OpalIoP7IOCPhbErrorData *)common; pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n", @@ -308,22 +336,13 @@ static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, be64_to_cpu(data->dma1ErrorLog0), be64_to_cpu(data->dma1ErrorLog1)); - for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { - if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && - (be64_to_cpu(data->pestB[i]) >> 63) == 0) - continue; - - pr_info("PE[%3d] A/B: %016llx %016llx\n", - i, be64_to_cpu(data->pestA[i]), - be64_to_cpu(data->pestB[i])); - } + pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_P7IOC_NUM_PEST_REGS); } static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, struct OpalIoPhbErrorCommon *common) { struct OpalIoPhb3ErrorData *data; - int i; data = 
(struct OpalIoPhb3ErrorData*)common; pr_info("PHB3 PHB#%x Diag-data (Version: %d)\n", @@ -404,15 +423,109 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, be64_to_cpu(data->dma1ErrorLog0), be64_to_cpu(data->dma1ErrorLog1)); - for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { - if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && - (be64_to_cpu(data->pestB[i]) >> 63) == 0) - continue; + pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB3_NUM_PEST_REGS); +} - pr_info("PE[%3d] A/B: %016llx %016llx\n", - i, be64_to_cpu(data->pestA[i]), - be64_to_cpu(data->pestB[i])); - } +static void pnv_pci_dump_phb4_diag_data(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) +{ + struct OpalIoPhb4ErrorData *data; + + data = (struct OpalIoPhb4ErrorData*)common; + pr_info("PHB4 PHB#%d Diag-data (Version: %d)\n", + hose->global_number, be32_to_cpu(common->version)); + if (data->brdgCtl) + pr_info("brdgCtl: %08x\n", + be32_to_cpu(data->brdgCtl)); + if (data->deviceStatus || data->slotStatus || + data->linkStatus || data->devCmdStatus || + data->devSecStatus) + pr_info("RootSts: %08x %08x %08x %08x %08x\n", + be32_to_cpu(data->deviceStatus), + be32_to_cpu(data->slotStatus), + be32_to_cpu(data->linkStatus), + be32_to_cpu(data->devCmdStatus), + be32_to_cpu(data->devSecStatus)); + if (data->rootErrorStatus || data->uncorrErrorStatus || + data->corrErrorStatus) + pr_info("RootErrSts: %08x %08x %08x\n", + be32_to_cpu(data->rootErrorStatus), + be32_to_cpu(data->uncorrErrorStatus), + be32_to_cpu(data->corrErrorStatus)); + if (data->tlpHdr1 || data->tlpHdr2 || + data->tlpHdr3 || data->tlpHdr4) + pr_info("RootErrLog: %08x %08x %08x %08x\n", + be32_to_cpu(data->tlpHdr1), + be32_to_cpu(data->tlpHdr2), + be32_to_cpu(data->tlpHdr3), + be32_to_cpu(data->tlpHdr4)); + if (data->sourceId) + pr_info("sourceId: %08x\n", be32_to_cpu(data->sourceId)); + if (data->nFir) + pr_info("nFir: %016llx %016llx %016llx\n", + be64_to_cpu(data->nFir), + be64_to_cpu(data->nFirMask), + 
be64_to_cpu(data->nFirWOF)); + if (data->phbPlssr || data->phbCsr) + pr_info("PhbSts: %016llx %016llx\n", + be64_to_cpu(data->phbPlssr), + be64_to_cpu(data->phbCsr)); + if (data->lemFir) + pr_info("Lem: %016llx %016llx %016llx\n", + be64_to_cpu(data->lemFir), + be64_to_cpu(data->lemErrorMask), + be64_to_cpu(data->lemWOF)); + if (data->phbErrorStatus) + pr_info("PhbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbErrorStatus), + be64_to_cpu(data->phbFirstErrorStatus), + be64_to_cpu(data->phbErrorLog0), + be64_to_cpu(data->phbErrorLog1)); + if (data->phbTxeErrorStatus) + pr_info("PhbTxeErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbTxeErrorStatus), + be64_to_cpu(data->phbTxeFirstErrorStatus), + be64_to_cpu(data->phbTxeErrorLog0), + be64_to_cpu(data->phbTxeErrorLog1)); + if (data->phbRxeArbErrorStatus) + pr_info("RxeArbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRxeArbErrorStatus), + be64_to_cpu(data->phbRxeArbFirstErrorStatus), + be64_to_cpu(data->phbRxeArbErrorLog0), + be64_to_cpu(data->phbRxeArbErrorLog1)); + if (data->phbRxeMrgErrorStatus) + pr_info("RxeMrgErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRxeMrgErrorStatus), + be64_to_cpu(data->phbRxeMrgFirstErrorStatus), + be64_to_cpu(data->phbRxeMrgErrorLog0), + be64_to_cpu(data->phbRxeMrgErrorLog1)); + if (data->phbRxeTceErrorStatus) + pr_info("RxeTceErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRxeTceErrorStatus), + be64_to_cpu(data->phbRxeTceFirstErrorStatus), + be64_to_cpu(data->phbRxeTceErrorLog0), + be64_to_cpu(data->phbRxeTceErrorLog1)); + + if (data->phbPblErrorStatus) + pr_info("PblErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbPblErrorStatus), + be64_to_cpu(data->phbPblFirstErrorStatus), + be64_to_cpu(data->phbPblErrorLog0), + be64_to_cpu(data->phbPblErrorLog1)); + if (data->phbPcieDlpErrorStatus) + pr_info("PcieDlp: %016llx %016llx %016llx\n", + be64_to_cpu(data->phbPcieDlpErrorLog1), + 
be64_to_cpu(data->phbPcieDlpErrorLog2), + be64_to_cpu(data->phbPcieDlpErrorStatus)); + if (data->phbRegbErrorStatus) + pr_info("RegbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbRegbErrorStatus), + be64_to_cpu(data->phbRegbFirstErrorStatus), + be64_to_cpu(data->phbRegbErrorLog0), + be64_to_cpu(data->phbRegbErrorLog1)); + + + pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB4_NUM_PEST_REGS); } void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, @@ -431,6 +544,9 @@ void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, case OPAL_PHB_ERROR_DATA_TYPE_PHB3: pnv_pci_dump_phb3_diag_data(hose, common); break; + case OPAL_PHB_ERROR_DATA_TYPE_PHB4: + pnv_pci_dump_phb4_diag_data(hose, common); + break; default: pr_warn("%s: Unrecognized ioType %d\n", __func__, be32_to_cpu(common->ioType)); @@ -445,8 +561,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) spin_lock_irqsave(&phb->lock, flags); /* Fetch PHB diag-data */ - rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, - PNV_PCI_DIAG_BUF_SIZE); + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, + phb->diag_data_size); has_diag = (rc == OPAL_SUCCESS); /* If PHB supports compound PE, to handle it */ @@ -474,7 +590,7 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) * with the normal errors generated when probing empty slots */ if (has_diag && ret) - pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); + pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data); spin_unlock_irqrestore(&phb->lock, flags); } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 18c8a2fa03b8..f16bc403ec03 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -33,6 +33,9 @@ enum pnv_phb_model { #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ +/* Indicates operations are frozen for a PE: MMIO 
in PESTA & DMA in PESTB. */ +#define PNV_IODA_STOPPED_STATE 0x8000000000000000 + /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_phb; struct pnv_ioda_pe { @@ -169,13 +172,9 @@ struct pnv_phb { unsigned int pe_rmap[0x10000]; } ioda; - /* PHB and hub status structure */ - union { - unsigned char blob[PNV_PCI_DIAG_BUF_SIZE]; - struct OpalIoP7IOCPhbErrorData p7ioc; - struct OpalIoPhb3ErrorData phb3; - struct OpalIoP7IOCErrorData hub_diag; - } diag; + /* PHB and hub diagnostics */ + unsigned int diag_data_size; + u8 *diag_data; /* Nvlink2 data */ struct npu { diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 4aff754b6f2c..40dae96f7e20 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -63,7 +63,8 @@ static int pnv_smp_kick_cpu(int nr) long rc; uint8_t status; - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= nr_cpu_ids) + return -EINVAL; /* * If we already started or OPAL is not supported, we just @@ -144,7 +145,14 @@ static void pnv_smp_cpu_kill_self(void) unsigned long srr1, wmask; /* Standard hot unplug procedure */ - local_irq_disable(); + /* + * This hard disables local interurpts, ensuring we have no lazy + * irqs pending. + */ + WARN_ON(irqs_disabled()); + hard_irq_disable(); + WARN_ON(lazy_irq_pending()); + idle_task_exit(); current->active_mm = NULL; /* for sanity */ cpu = smp_processor_id(); @@ -162,16 +170,6 @@ static void pnv_smp_cpu_kill_self(void) */ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); - /* - * Hard-disable interrupts, and then clear irq_happened flags - * that we can safely ignore while off-line, since they - * are for things for which we do no processing when off-line - * (or in the case of HMI, all the processing we need to do - * is done in lower-level real-mode code). 
- */ - hard_irq_disable(); - local_paca->irq_happened &= ~(PACA_IRQ_DEC | PACA_IRQ_HMI); - while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -182,9 +180,9 @@ static void pnv_smp_cpu_kill_self(void) */ kvmppc_set_host_ipi(cpu, 0); - ppc64_runlatch_off(); srr1 = pnv_cpu_offline(cpu); - ppc64_runlatch_on(); + + WARN_ON(lazy_irq_pending()); /* * If the SRR1 value indicates that we woke up due to @@ -198,8 +196,7 @@ static void pnv_smp_cpu_kill_self(void) * contains 0. */ if (((srr1 & wmask) == SRR1_WAKEEE) || - ((srr1 & wmask) == SRR1_WAKEHVI) || - (local_paca->irq_happened & PACA_IRQ_EE)) { + ((srr1 & wmask) == SRR1_WAKEHVI)) { if (cpu_has_feature(CPU_FTR_ARCH_300)) { if (xive_enabled()) xive_flush_interrupt(); @@ -211,14 +208,15 @@ static void pnv_smp_cpu_kill_self(void) unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); asm volatile(PPC_MSGCLR(%0) : : "r" (msg)); } - local_paca->irq_happened &= ~(PACA_IRQ_EE | PACA_IRQ_DBELL); smp_mb(); if (cpu_core_split_required()) continue; if (srr1 && !generic_check_cpu_restart(cpu)) - DBG("CPU%d Unexpected exit while offline !\n", cpu); + DBG("CPU%d Unexpected exit while offline srr1=%lx!\n", + cpu, srr1); + } /* Re-enable decrementer interrupts */ diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 309876d699e9..596ae2e98040 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -18,6 +18,7 @@ #include <linux/stop_machine.h> #include <asm/cputhreads.h> +#include <asm/cpuidle.h> #include <asm/kvm_ppc.h> #include <asm/machdep.h> #include <asm/opal.h> @@ -182,7 +183,7 @@ static void unsplit_core(void) cpu = smp_processor_id(); if (cpu_thread_in_core(cpu) != 0) { while (mfspr(SPRN_HID0) & mask) - power7_nap(0); + power7_idle_insn(PNV_THREAD_NAP); per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; return; diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig index 913c54e23eea..3a6dfd14f64b 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -124,7 +124,7 @@ config HV_PERF_CTRS Enable access to hypervisor supplied counters in perf. Currently, this enables code that uses the hcall GetPerfCounterInfo and 24x7 interfaces to retrieve counters. GPCI exists on Power 6 and later - systems. 24x7 is available on Power 8 systems. + systems. 24x7 is available on Power 8 and later systems. If unsure, select Y. diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 7bc0e91f8715..6afd1efd3633 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -554,7 +554,7 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) { int rc; - pr_debug("Attemping to remove CPU %s, drc index: %x\n", + pr_debug("Attempting to remove CPU %s, drc index: %x\n", dn->name, drc_index); rc = dlpar_offline_cpu(dn); diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 1fb162ba9d1c..ca9b2f4aaa22 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -22,6 +22,7 @@ #include <asm/machdep.h> #include <asm/prom.h> #include <asm/sparsemem.h> +#include <asm/fadump.h> #include "pseries.h" static bool rtas_hp_event; @@ -408,6 +409,12 @@ static bool lmb_is_removable(struct of_drconf_cell *lmb) scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; phys_addr = lmb->base_addr; +#ifdef CONFIG_FA_DUMP + /* Don't hot-remove memory that falls in fadump boot memory area */ + if (is_fadump_boot_memory_area(phys_addr, block_sz)) + return false; +#endif + for (i = 0; i < scns_per_block; i++) { pfn = PFN_DOWN(phys_addr); if (!pfn_present(pfn)) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 
6541d0b03e4c..495ba4e7336d 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -301,7 +301,7 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, int ssize, unsigned long inv_flags) { unsigned long lpar_rc; - unsigned long flags = (newpp & 7) | H_AVPN; + unsigned long flags; unsigned long want_v; want_v = hpte_encode_avpn(vpn, psize, ssize); @@ -309,6 +309,11 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...", want_v, slot, flags, psize); + flags = (newpp & 7) | H_AVPN; + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + /* Move pp0 into bit 8 (IBM 55) */ + flags |= (newpp & HPTE_R_PP0) >> 55; + lpar_rc = plpar_pte_protect(flags, slot, want_v); if (lpar_rc == H_NOT_FOUND) { @@ -380,6 +385,10 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, BUG_ON(slot == -1); flags = newpp & 7; + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + /* Move pp0 into bit 8 (IBM 55) */ + flags |= (newpp & HPTE_R_PP0) >> 55; + lpar_rc = plpar_pte_protect(flags, slot, 0); BUG_ON(lpar_rc != H_SUCCESS); diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 52ca6b311d44..24785f63fb40 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -151,7 +151,8 @@ static void smp_setup_cpu(int cpu) static int smp_pSeries_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= nr_cpu_ids) + return -EINVAL; if (!smp_startup_cpu(nr)) return -ENOENT; diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c index 3e828b20c21e..2842f9d63d21 100644 --- a/arch/powerpc/sysdev/mpc8xx_pic.c +++ b/arch/powerpc/sysdev/mpc8xx_pic.c @@ -79,7 +79,7 @@ unsigned int mpc8xx_get_irq(void) irq = in_be32(&siu_reg->sc_sivec) >> 26; if (irq == PIC_VEC_SPURRIOUS) - irq = 0; + return 0; return irq_linear_revmap(mpc8xx_pic_host, irq); diff --git 
a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 8f5e3035483b..6595462b1fc8 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1417,7 +1417,7 @@ bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, /* Get ready for interrupts */ xive_setup_cpu(); - pr_info("Interrupt handling intialized with %s backend\n", + pr_info("Interrupt handling initialized with %s backend\n", xive_ops->name); pr_info("Using priority %d for all interrupts\n", max_prio); diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index ab9ecce61ee5..0f95476b01f6 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -633,8 +633,8 @@ u32 xive_native_alloc_vp_block(u32 max_vcpus) if (max_vcpus > (1 << order)) order++; - pr_info("VP block alloc, for max VCPUs %d use order %d\n", - max_vcpus, order); + pr_debug("VP block alloc, for max VCPUs %d use order %d\n", + max_vcpus, order); for (;;) { rc = opal_xive_alloc_vp_block(order); diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh new file mode 100644 index 000000000000..ad9e57209aa4 --- /dev/null +++ b/arch/powerpc/tools/head_check.sh @@ -0,0 +1,78 @@ +# Copyright © 2016 IBM Corporation + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version +# 2 of the License, or (at your option) any later version. + +# This script checks the head of a vmlinux for linker stubs that +# break our placement of fixed-location code for 64-bit. + +# based on relocs_check.pl +# Copyright © 2009 IBM Corporation + +# NOTE! +# +# If the build dies here, it's likely code in head_64.S/exception-64*.S or +# nearby, is branching to labels it can't reach directly, which results in the +# linker inserting branch stubs. 
This can move code around in ways that break +# the fixed section calculations (head-64.h). To debug this, disassemble the +# vmlinux and look for branch stubs (long_branch, plt_branch, etc.) in the +# fixed section region (0 - 0x8000ish). Check what code is calling those stubs, +# and perhaps change so a direct branch can reach. +# +# A ".linker_stub_catch" section is used to catch some stubs generated by +# early .text code, which tend to get placed at the start of the section. +# If there are too many such stubs, they can overflow this section. Expanding +# it may help (or reducing the number of stub branches). +# +# Linker stubs use the TOC pointer, so even if fixed section code could +# tolerate them being inserted into head code, they can't be allowed in low +# level entry code (boot, interrupt vectors, etc) until r2 is set up. This +# could cause the kernel to die in early boot. + +# Turn this on if you want more debug output: +# set -x + +if [ $# -lt 2 ]; then + echo "$0 [path to nm] [path to vmlinux]" 1>&2 + exit 1 +fi + +# Have Kbuild supply the path to nm so we handle cross compilation. 
+nm="$1" +vmlinux="$2" + +# gcc-4.6-era toolchain make _stext an A (absolute) symbol rather than T +$nm "$vmlinux" | grep -e " [TA] _stext$" -e " t start_first_256B$" -e " a text_start$" -e " t start_text$" -m4 > .tmp_symbols.txt + + +vma=$(cat .tmp_symbols.txt | grep -e " [TA] _stext$" | cut -d' ' -f1) + +expected_start_head_addr=$vma + +start_head_addr=$(cat .tmp_symbols.txt | grep " t start_first_256B$" | cut -d' ' -f1) + +if [ "$start_head_addr" != "$expected_start_head_addr" ]; then + echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + + exit 1 +fi + +top_vma=$(echo $vma | cut -d'0' -f1) + +expected_start_text_addr=$(cat .tmp_symbols.txt | grep " a text_start$" | cut -d' ' -f1 | sed "s/^0/$top_vma/") + +start_text_addr=$(cat .tmp_symbols.txt | grep " t start_text$" | cut -d' ' -f1) + +if [ "$start_text_addr" != "$expected_start_text_addr" ]; then + echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + + exit 1 +fi + +rm -f .tmp_symbols.txt diff --git a/arch/powerpc/tools/unrel_branch_check.sh b/arch/powerpc/tools/unrel_branch_check.sh new file mode 100755 index 000000000000..1e972df3107e --- /dev/null +++ b/arch/powerpc/tools/unrel_branch_check.sh @@ -0,0 +1,57 @@ +# Copyright © 2016 IBM Corporation +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version +# 2 of the License, or (at your option) any later version. +# +# This script checks the relocations of a vmlinux for "suspicious" +# branches from unrelocated code (head_64.S code). 
+ +# Turn this on if you want more debug output: +# set -x + +# Have Kbuild supply the path to objdump so we handle cross compilation. +objdump="$1" +vmlinux="$2" + +#__end_interrupts should be located within the first 64K + +end_intr=0x$( +"$objdump" -R "$vmlinux" -d --start-address=0xc000000000000000 \ + --stop-address=0xc000000000010000 | +grep '\<__end_interrupts>:' | +awk '{print $1}' +) + +BRANCHES=$( +"$objdump" -R "$vmlinux" -D --start-address=0xc000000000000000 \ + --stop-address=${end_intr} | +grep -e "^c[0-9a-f]*:[[:space:]]*\([0-9a-f][0-9a-f][[:space:]]\)\{4\}[[:space:]]*b" | +grep -v '\<__start_initialization_multiplatform>' | +grep -v -e 'b.\?.\?ctr' | +grep -v -e 'b.\?.\?lr' | +sed 's/://' | +awk '{ print $1 ":" $6 ":0x" $7 ":" $8 " "}' +) + +for tuple in $BRANCHES +do + from=`echo $tuple | cut -d':' -f1` + branch=`echo $tuple | cut -d':' -f2` + to=`echo $tuple | cut -d':' -f3 | sed 's/cr[0-7],//'` + sym=`echo $tuple | cut -d':' -f4` + + if (( $to > $end_intr )) + then + if [ -z "$bad_branches" ]; then + echo "WARNING: Unrelocated relative branches" + bad_branches="yes" + fi + echo "$from $branch-> $to $sym" + fi +done + +if [ -z "$bad_branches" ]; then + exit 0 +fi diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index f11f65634aab..08e367e3e8c3 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -53,6 +53,7 @@ #include <asm/xive.h> #include <asm/opal.h> #include <asm/firmware.h> +#include <asm/code-patching.h> #ifdef CONFIG_PPC64 #include <asm/hvcall.h> @@ -837,7 +838,8 @@ static void insert_bpts(void) store_inst(&bp->instr[0]); if (bp->enabled & BP_CIABR) continue; - if (mwrite(bp->address, &bpinstr, 4) != 4) { + if (patch_instruction((unsigned int *)bp->address, + bpinstr) != 0) { printf("Couldn't write instruction at %lx, " "disabling breakpoint there\n", bp->address); bp->enabled &= ~BP_TRAP; @@ -874,7 +876,8 @@ static void remove_bpts(void) continue; if (mread(bp->address, &instr, 4) == 4 && instr == bpinstr 
- && mwrite(bp->address, &bp->instr, 4) != 4) + && patch_instruction( + (unsigned int *)bp->address, bp->instr[0]) != 0) printf("Couldn't remove breakpoint at %lx\n", bp->address); else @@ -1242,14 +1245,14 @@ bpt_cmds(void) { int cmd; unsigned long a; - int mode, i; + int i; struct bpt *bp; - const char badaddr[] = "Only kernel addresses are permitted " - "for breakpoints\n"; cmd = inchar(); switch (cmd) { -#ifndef CONFIG_8xx +#ifndef CONFIG_PPC_8xx + static const char badaddr[] = "Only kernel addresses are permitted for breakpoints\n"; + int mode; case 'd': /* bd - hardware data breakpoint */ mode = 7; cmd = inchar(); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7d7e0e811c46..94a18681353d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_ZONE_DEVICE if X86_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 12409a519cc5..37b0698b7193 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -32,18 +32,18 @@ static struct cpuidle_driver powernv_idle_driver = { .owner = THIS_MODULE, }; -static int max_idle_state; -static struct cpuidle_state *cpuidle_state_table; +static int max_idle_state __read_mostly; +static struct cpuidle_state *cpuidle_state_table __read_mostly; struct stop_psscr_table { u64 val; u64 mask; }; -static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX]; +static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; -static u64 snooze_timeout; -static bool snooze_timeout_en; +static u64 snooze_timeout __read_mostly; +static bool snooze_timeout_en __read_mostly; static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -51,21 +51,30 @@ static int 
snooze_loop(struct cpuidle_device *dev, { u64 snooze_exit_time; - local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); + local_irq_enable(); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { - if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) + if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + /* + * Task has not woken up but we are exiting the polling + * loop anyway. Require a barrier after polling is + * cleared to order subsequent test of need_resched(). + */ + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb(); break; + } } HMT_medium(); ppc64_runlatch_on(); clear_thread_flag(TIF_POLLING_NRFLAG); - smp_mb(); + return index; } @@ -73,9 +82,8 @@ static int nap_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - ppc64_runlatch_off(); - power7_idle(); - ppc64_runlatch_on(); + power7_idle_type(PNV_THREAD_NAP); + return index; } @@ -98,7 +106,8 @@ static int fastsleep_loop(struct cpuidle_device *dev, new_lpcr &= ~LPCR_PECE1; mtspr(SPRN_LPCR, new_lpcr); - power7_sleep(); + + power7_idle_type(PNV_THREAD_SLEEP); mtspr(SPRN_LPCR, old_lpcr); @@ -110,10 +119,8 @@ static int stop_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - ppc64_runlatch_off(); - power9_idle_stop(stop_psscr_table[index].val, + power9_idle_type(stop_psscr_table[index].val, stop_psscr_table[index].mask); - ppc64_runlatch_on(); return index; } @@ -354,6 +361,7 @@ static int powernv_add_idle_states(void) for (i = 0; i < dt_idle_states; i++) { unsigned int exit_latency, target_residency; + bool stops_timebase = false; /* * If an idle state has exit latency beyond * POWERNV_THRESHOLD_LATENCY_NS then don't use it @@ -381,6 +389,9 @@ static int powernv_add_idle_states(void) } } + if (flags[i] & OPAL_PM_TIMEBASE_STOP) + stops_timebase = true; + /* * For nap and fastsleep, use default target_residency * values if f/w does not expose it. 
@@ -392,8 +403,7 @@ static int powernv_add_idle_states(void) add_powernv_state(nr_idle_states, "Nap", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); - } else if ((flags[i] & OPAL_PM_STOP_INST_FAST) && - !(flags[i] & OPAL_PM_TIMEBASE_STOP)) { + } else if (has_stop_states && !stops_timebase) { add_powernv_state(nr_idle_states, names[i], CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, @@ -405,8 +415,8 @@ static int powernv_add_idle_states(void) * within this config dependency check. */ #ifdef CONFIG_TICK_ONESHOT - if (flags[i] & OPAL_PM_SLEEP_ENABLED || - flags[i] & OPAL_PM_SLEEP_ENABLED_ER1) { + else if (flags[i] & OPAL_PM_SLEEP_ENABLED || + flags[i] & OPAL_PM_SLEEP_ENABLED_ER1) { if (!rc) target_residency = 300000; /* Add FASTSLEEP state */ @@ -414,14 +424,15 @@ static int powernv_add_idle_states(void) CPUIDLE_FLAG_TIMER_STOP, fastsleep_loop, target_residency, exit_latency, 0, 0); - } else if ((flags[i] & OPAL_PM_STOP_INST_DEEP) && - (flags[i] & OPAL_PM_TIMEBASE_STOP)) { + } else if (has_stop_states && stops_timebase) { add_powernv_state(nr_idle_states, names[i], CPUIDLE_FLAG_TIMER_STOP, stop_loop, target_residency, exit_latency, psscr_val[i], psscr_mask[i]); } #endif + else + continue; nr_idle_states++; } out: diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 166ccd711ec9..e9b3853d93ea 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -25,10 +25,10 @@ struct cpuidle_driver pseries_idle_driver = { .owner = THIS_MODULE, }; -static int max_idle_state; -static struct cpuidle_state *cpuidle_state_table; -static u64 snooze_timeout; -static bool snooze_timeout_en; +static int max_idle_state __read_mostly; +static struct cpuidle_state *cpuidle_state_table __read_mostly; +static u64 snooze_timeout __read_mostly; +static bool snooze_timeout_en __read_mostly; static inline void idle_loop_prolog(unsigned long *in_purr) { @@ -62,21 +62,29 @@ static int 
snooze_loop(struct cpuidle_device *dev, unsigned long in_purr; u64 snooze_exit_time; + set_thread_flag(TIF_POLLING_NRFLAG); + idle_loop_prolog(&in_purr); local_irq_enable(); - set_thread_flag(TIF_POLLING_NRFLAG); snooze_exit_time = get_tb() + snooze_timeout; while (!need_resched()) { HMT_low(); HMT_very_low(); - if (snooze_timeout_en && get_tb() > snooze_exit_time) + if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + /* + * Task has not woken up but we are exiting the polling + * loop anyway. Require a barrier after polling is + * cleared to order subsequent test of need_resched(). + */ + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb(); break; + } } HMT_medium(); clear_thread_flag(TIF_POLLING_NRFLAG); - smp_mb(); idle_loop_epilog(in_purr); diff --git a/drivers/misc/cxl/Kconfig b/drivers/misc/cxl/Kconfig index b75cf830d08a..93397cb05b15 100644 --- a/drivers/misc/cxl/Kconfig +++ b/drivers/misc/cxl/Kconfig @@ -11,11 +11,16 @@ config CXL_AFU_DRIVER_OPS bool default n +config CXL_LIB + bool + default n + config CXL tristate "Support for IBM Coherent Accelerators (CXL)" depends on PPC_POWERNV && PCI_MSI && EEH select CXL_BASE select CXL_AFU_DRIVER_OPS + select CXL_LIB default m help Select this option to enable driver support for IBM Coherent diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile index c14fd6b65b5a..0b5fd749d96d 100644 --- a/drivers/misc/cxl/Makefile +++ b/drivers/misc/cxl/Makefile @@ -3,7 +3,7 @@ ccflags-$(CONFIG_PPC_WERROR) += -Werror cxl-y += main.o file.o irq.o fault.o native.o cxl-y += context.o sysfs.o pci.o trace.o -cxl-y += vphb.o phb.o api.o +cxl-y += vphb.o phb.o api.o cxllib.o cxl-$(CONFIG_PPC_PSERIES) += flash.o guest.o of.o hcalls.o cxl-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_CXL) += cxl.o diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index a03f8e7535e5..b1afeccbb97f 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -1010,6 +1010,7 @@ static inline void 
cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct den void cxl_handle_fault(struct work_struct *work); void cxl_prefault(struct cxl_context *ctx, u64 wed); +int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar); struct cxl *get_cxl_adapter(int num); int cxl_alloc_sst(struct cxl_context *ctx); @@ -1061,6 +1062,11 @@ int cxl_afu_slbia(struct cxl_afu *afu); int cxl_data_cache_flush(struct cxl *adapter); int cxl_afu_disable(struct cxl_afu *afu); int cxl_psl_purge(struct cxl_afu *afu); +int cxl_calc_capp_routing(struct pci_dev *dev, u64 *chipid, + u32 *phb_index, u64 *capp_unit_id); +int cxl_slot_is_switched(struct pci_dev *dev); +int cxl_get_xsl9_dsnctl(u64 capp_unit_id, u64 *reg); +u64 cxl_calculate_sr(bool master, bool kernel, bool real_mode, bool p9); void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx); void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx); diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c new file mode 100644 index 000000000000..5dba23ca2e5f --- /dev/null +++ b/drivers/misc/cxl/cxllib.c @@ -0,0 +1,246 @@ +/* + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/hugetlb.h> +#include <linux/sched/mm.h> +#include <asm/pnv-pci.h> +#include <misc/cxllib.h> + +#include "cxl.h" + +#define CXL_INVALID_DRA ~0ull +#define CXL_DUMMY_READ_SIZE 128 +#define CXL_DUMMY_READ_ALIGN 8 +#define CXL_CAPI_WINDOW_START 0x2000000000000ull +#define CXL_CAPI_WINDOW_LOG_SIZE 48 +#define CXL_XSL_CONFIG_CURRENT_VERSION CXL_XSL_CONFIG_VERSION1 + + +bool cxllib_slot_is_supported(struct pci_dev *dev, unsigned long flags) +{ + int rc; + u32 phb_index; + u64 chip_id, capp_unit_id; + + /* No flags currently supported */ + if (flags) + return false; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return false; + + if (!cxl_is_power9()) + return false; + + if (cxl_slot_is_switched(dev)) + return false; + + /* on p9, some pci slots are not connected to a CAPP unit */ + rc = cxl_calc_capp_routing(dev, &chip_id, &phb_index, &capp_unit_id); + if (rc) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(cxllib_slot_is_supported); + +static DEFINE_MUTEX(dra_mutex); +static u64 dummy_read_addr = CXL_INVALID_DRA; + +static int allocate_dummy_read_buf(void) +{ + u64 buf, vaddr; + size_t buf_size; + + /* + * Dummy read buffer is 128-byte long, aligned on a + * 256-byte boundary and we need the physical address. 
+ */ + buf_size = CXL_DUMMY_READ_SIZE + (1ull << CXL_DUMMY_READ_ALIGN); + buf = (u64) kzalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + vaddr = (buf + (1ull << CXL_DUMMY_READ_ALIGN) - 1) & + (~0ull << CXL_DUMMY_READ_ALIGN); + + WARN((vaddr + CXL_DUMMY_READ_SIZE) > (buf + buf_size), + "Dummy read buffer alignment issue"); + dummy_read_addr = virt_to_phys((void *) vaddr); + return 0; +} + +int cxllib_get_xsl_config(struct pci_dev *dev, struct cxllib_xsl_config *cfg) +{ + int rc; + u32 phb_index; + u64 chip_id, capp_unit_id; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EINVAL; + + mutex_lock(&dra_mutex); + if (dummy_read_addr == CXL_INVALID_DRA) { + rc = allocate_dummy_read_buf(); + if (rc) { + mutex_unlock(&dra_mutex); + return rc; + } + } + mutex_unlock(&dra_mutex); + + rc = cxl_calc_capp_routing(dev, &chip_id, &phb_index, &capp_unit_id); + if (rc) + return rc; + + rc = cxl_get_xsl9_dsnctl(capp_unit_id, &cfg->dsnctl); + if (rc) + return rc; + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + /* workaround for DD1 - nbwind = capiind */ + cfg->dsnctl |= ((u64)0x02 << (63-47)); + } + + cfg->version = CXL_XSL_CONFIG_CURRENT_VERSION; + cfg->log_bar_size = CXL_CAPI_WINDOW_LOG_SIZE; + cfg->bar_addr = CXL_CAPI_WINDOW_START; + cfg->dra = dummy_read_addr; + return 0; +} +EXPORT_SYMBOL_GPL(cxllib_get_xsl_config); + +int cxllib_switch_phb_mode(struct pci_dev *dev, enum cxllib_mode mode, + unsigned long flags) +{ + int rc = 0; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EINVAL; + + switch (mode) { + case CXL_MODE_PCI: + /* + * We currently don't support going back to PCI mode + * However, we'll turn the invalidations off, so that + * the firmware doesn't have to ack them and can do + * things like reset, etc.. with no worries. 
+ * So always return EPERM (can't go back to PCI) or + * EBUSY if we couldn't even turn off snooping + */ + rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_SNOOP_OFF); + if (rc) + rc = -EBUSY; + else + rc = -EPERM; + break; + case CXL_MODE_CXL: + /* DMA only supported on TVT1 for the time being */ + if (flags != CXL_MODE_DMA_TVT1) + return -EINVAL; + rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_DMA_TVT1); + if (rc) + return rc; + rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_SNOOP_ON); + break; + default: + rc = -EINVAL; + } + return rc; +} +EXPORT_SYMBOL_GPL(cxllib_switch_phb_mode); + +/* + * When switching the PHB to capi mode, the TVT#1 entry for + * the Partitionable Endpoint is set in bypass mode, like + * in PCI mode. + * Configure the device dma to use TVT#1, which is done + * by calling dma_set_mask() with a mask large enough. + */ +int cxllib_set_device_dma(struct pci_dev *dev, unsigned long flags) +{ + int rc; + + if (flags) + return -EINVAL; + + rc = dma_set_mask(&dev->dev, DMA_BIT_MASK(64)); + return rc; +} +EXPORT_SYMBOL_GPL(cxllib_set_device_dma); + +int cxllib_get_PE_attributes(struct task_struct *task, + unsigned long translation_mode, + struct cxllib_pe_attributes *attr) +{ + struct mm_struct *mm = NULL; + + if (translation_mode != CXL_TRANSLATED_MODE && + translation_mode != CXL_REAL_MODE) + return -EINVAL; + + attr->sr = cxl_calculate_sr(false, + task == NULL, + translation_mode == CXL_REAL_MODE, + true); + attr->lpid = mfspr(SPRN_LPID); + if (task) { + mm = get_task_mm(task); + if (mm == NULL) + return -EINVAL; + /* + * Caller is keeping a reference on mm_users for as long + * as XSL uses the memory context + */ + attr->pid = mm->context.id; + mmput(mm); + } else { + attr->pid = 0; + } + attr->tid = 0; + return 0; +} +EXPORT_SYMBOL_GPL(cxllib_get_PE_attributes); + +int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags) +{ + int rc; + u64 dar; + struct vm_area_struct *vma = NULL; + unsigned long page_size; + + if (mm 
== NULL) + return -EFAULT; + + down_read(&mm->mmap_sem); + + for (dar = addr; dar < addr + size; dar += page_size) { + if (!vma || dar < vma->vm_start || dar > vma->vm_end) { + vma = find_vma(mm, addr); + if (!vma) { + pr_err("Can't find vma for addr %016llx\n", addr); + rc = -EFAULT; + goto out; + } + /* get the size of the pages allocated */ + page_size = vma_kernel_pagesize(vma); + } + + rc = cxl_handle_mm_fault(mm, flags, dar); + if (rc) { + pr_err("cxl_handle_mm_fault failed %d", rc); + rc = -EFAULT; + goto out; + } + } + rc = 0; +out: + up_read(&mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(cxllib_handle_fault); diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index c79e39bad7a4..6eed7d03e2b5 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -132,18 +132,15 @@ static int cxl_handle_segment_miss(struct cxl_context *ctx, return IRQ_HANDLED; } -static void cxl_handle_page_fault(struct cxl_context *ctx, - struct mm_struct *mm, u64 dsisr, u64 dar) +int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar) { unsigned flt = 0; int result; unsigned long access, flags, inv_flags = 0; - trace_cxl_pte_miss(ctx, dsisr, dar); - if ((result = copro_handle_mm_fault(mm, dar, dsisr, &flt))) { pr_devel("copro_handle_mm_fault failed: %#x\n", result); - return cxl_ack_ae(ctx); + return result; } if (!radix_enabled()) { @@ -155,9 +152,8 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, if (dsisr & CXL_PSL_DSISR_An_S) access |= _PAGE_WRITE; - access |= _PAGE_PRIVILEGED; - if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID)) - access &= ~_PAGE_PRIVILEGED; + if (!mm && (REGION_ID(dar) != USER_REGION_ID)) + access |= _PAGE_PRIVILEGED; if (dsisr & DSISR_NOHPTE) inv_flags |= HPTE_NOHPTE_UPDATE; @@ -166,8 +162,21 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, hash_page_mm(mm, dar, access, 0x300, inv_flags); local_irq_restore(flags); } - pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe); - 
cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0); + return 0; +} + +static void cxl_handle_page_fault(struct cxl_context *ctx, + struct mm_struct *mm, + u64 dsisr, u64 dar) +{ + trace_cxl_pte_miss(ctx, dsisr, dar); + + if (cxl_handle_mm_fault(mm, dsisr, dar)) { + cxl_ack_ae(ctx); + } else { + pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe); + cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0); + } } /* diff --git a/drivers/misc/cxl/flash.c b/drivers/misc/cxl/flash.c index 7c61c70ba3f6..3aa216bf0939 100644 --- a/drivers/misc/cxl/flash.c +++ b/drivers/misc/cxl/flash.c @@ -401,8 +401,10 @@ static int device_open(struct inode *inode, struct file *file) if (down_interruptible(&sem) != 0) return -EPERM; - if (!(adapter = get_cxl_adapter(adapter_num))) - return -ENODEV; + if (!(adapter = get_cxl_adapter(adapter_num))) { + rc = -ENODEV; + goto err_unlock; + } file->private_data = adapter; continue_token = 0; @@ -446,6 +448,8 @@ err1: free_page((unsigned long) le); err: put_device(&adapter->dev); +err_unlock: + up(&sem); return rc; } diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 2b2f8894149d..4a82c313cf71 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -586,17 +586,17 @@ err: #define set_endian(sr) ((sr) &= ~(CXL_PSL_SR_An_LE)) #endif -static u64 calculate_sr(struct cxl_context *ctx) +u64 cxl_calculate_sr(bool master, bool kernel, bool real_mode, bool p9) { u64 sr = 0; set_endian(sr); - if (ctx->master) + if (master) sr |= CXL_PSL_SR_An_MP; if (mfspr(SPRN_LPCR) & LPCR_TC) sr |= CXL_PSL_SR_An_TC; - if (ctx->kernel) { - if (!ctx->real_mode) + if (kernel) { + if (!real_mode) sr |= CXL_PSL_SR_An_R; sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV; } else { @@ -608,7 +608,7 @@ static u64 calculate_sr(struct cxl_context *ctx) if (!test_tsk_thread_flag(current, TIF_32BIT)) sr |= CXL_PSL_SR_An_SF; } - if (cxl_is_power9()) { + if (p9) { if (radix_enabled()) sr |= CXL_PSL_SR_An_XLAT_ror; else @@ -617,6 +617,12 @@ static u64 
calculate_sr(struct cxl_context *ctx) return sr; } +static u64 calculate_sr(struct cxl_context *ctx) +{ + return cxl_calculate_sr(ctx->master, ctx->kernel, ctx->real_mode, + cxl_is_power9()); +} + static void update_ivtes_directed(struct cxl_context *ctx) { bool need_update = (ctx->status == STARTED); diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 1eb9859809bf..d18b3d9292fd 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -375,7 +375,7 @@ static u64 get_capp_unit_id(struct device_node *np, u32 phb_index) return 0; } -static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, +int cxl_calc_capp_routing(struct pci_dev *dev, u64 *chipid, u32 *phb_index, u64 *capp_unit_id) { int rc; @@ -408,17 +408,9 @@ static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, return 0; } -static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci_dev *dev) +int cxl_get_xsl9_dsnctl(u64 capp_unit_id, u64 *reg) { - u64 xsl_dsnctl, psl_fircntl; - u64 chipid; - u32 phb_index; - u64 capp_unit_id; - int rc; - - rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); - if (rc) - return rc; + u64 xsl_dsnctl; /* * CAPI Identifier bits [0:7] @@ -454,6 +446,27 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci xsl_dsnctl |= ((u64)0x04 << (63-55)); } + *reg = xsl_dsnctl; + return 0; +} + +static int init_implementation_adapter_regs_psl9(struct cxl *adapter, + struct pci_dev *dev) +{ + u64 xsl_dsnctl, psl_fircntl; + u64 chipid; + u32 phb_index; + u64 capp_unit_id; + int rc; + + rc = cxl_calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); + if (rc) + return rc; + + rc = cxl_get_xsl9_dsnctl(capp_unit_id, &xsl_dsnctl); + if (rc) + return rc; + cxl_p1_write(adapter, CXL_XSL9_DSNCTL, xsl_dsnctl); /* Set fir_cntl to recommended value for production env */ @@ -505,7 +518,7 @@ static int init_implementation_adapter_regs_psl8(struct cxl *adapter, struct pci u64 capp_unit_id; int rc; - rc = 
calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); + rc = cxl_calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); if (rc) return rc; @@ -538,7 +551,7 @@ static int init_implementation_adapter_regs_xsl(struct cxl *adapter, struct pci_ u64 capp_unit_id; int rc; - rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); + rc = cxl_calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); if (rc) return rc; @@ -1897,7 +1910,7 @@ static void cxl_pci_remove_adapter(struct cxl *adapter) #define CXL_MAX_PCIEX_PARENT 2 -static int cxl_slot_is_switched(struct pci_dev *dev) +int cxl_slot_is_switched(struct pci_dev *dev) { struct device_node *np; int depth = 0; diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 8b9049dac094..e6e31a16f68f 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1688,7 +1688,7 @@ config MEN_A21_WDT config WATCHDOG_RTAS tristate "RTAS watchdog" - depends on PPC_RTAS || (PPC64 && COMPILE_TEST) + depends on PPC_RTAS help This driver adds watchdog support for the RTAS watchdog. diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f9f56f231ae6..da0be9a8d1de 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -594,6 +594,7 @@ #define SBSS(sbss_align) \ . 
= ALIGN(sbss_align); \ .sbss : AT(ADDR(.sbss) - LOAD_OFFSET) { \ + *(.dynsbss) \ *(.sbss) \ *(.scommon) \ } @@ -640,11 +641,22 @@ .debug_str 0 : { *(.debug_str) } \ .debug_loc 0 : { *(.debug_loc) } \ .debug_macinfo 0 : { *(.debug_macinfo) } \ + .debug_pubtypes 0 : { *(.debug_pubtypes) } \ + /* DWARF 3 */ \ + .debug_ranges 0 : { *(.debug_ranges) } \ /* SGI/MIPS DWARF 2 extensions */ \ .debug_weaknames 0 : { *(.debug_weaknames) } \ .debug_funcnames 0 : { *(.debug_funcnames) } \ .debug_typenames 0 : { *(.debug_typenames) } \ .debug_varnames 0 : { *(.debug_varnames) } \ + /* GNU DWARF 2 extensions */ \ + .debug_gnu_pubnames 0 : { *(.debug_gnu_pubnames) } \ + .debug_gnu_pubtypes 0 : { *(.debug_gnu_pubtypes) } \ + /* DWARF 4 */ \ + .debug_types 0 : { *(.debug_types) } \ + /* DWARF 5 */ \ + .debug_macro 0 : { *(.debug_macro) } \ + .debug_addr 0 : { *(.debug_addr) } /* Stabs debugging sections. */ #define STABS_DEBUG \ diff --git a/include/linux/processor.h b/include/linux/processor.h new file mode 100644 index 000000000000..da0c5e56ca02 --- /dev/null +++ b/include/linux/processor.h @@ -0,0 +1,70 @@ +/* Misc low level processor primitives */ +#ifndef _LINUX_PROCESSOR_H +#define _LINUX_PROCESSOR_H + +#include <asm/processor.h> + +/* + * spin_begin is used before beginning a busy-wait loop, and must be paired + * with spin_end when the loop is exited. spin_cpu_relax must be called + * within the loop. + * + * The loop body should be as small and fast as possible, on the order of + * tens of instructions/cycles as a guide. It should and avoid calling + * cpu_relax, or any "spin" or sleep type of primitive including nested uses + * of these primitives. It should not lock or take any other resource. + * Violations of these guidelies will not cause a bug, but may cause sub + * optimal performance. + * + * These loops are optimized to be used where wait times are expected to be + * less than the cost of a context switch (and associated overhead). 
+ * + * Detection of resource owner and decision to spin or sleep or guest-yield + * (e.g., spin lock holder vcpu preempted, or mutex owner not on CPU) can be + * tested within the loop body. + */ +#ifndef spin_begin +#define spin_begin() +#endif + +#ifndef spin_cpu_relax +#define spin_cpu_relax() cpu_relax() +#endif + +/* + * spin_cpu_yield may be called to yield (undirected) to the hypervisor if + * necessary. This should be used if the wait is expected to take longer + * than context switch overhead, but we can't sleep or do a directed yield. + */ +#ifndef spin_cpu_yield +#define spin_cpu_yield() cpu_relax_yield() +#endif + +#ifndef spin_end +#define spin_end() +#endif + +/* + * spin_until_cond can be used to wait for a condition to become true. It + * may be expected that the first iteration will true in the common case + * (no spinning), so that callers should not require a first "likely" test + * for the uncontended case before using this primitive. + * + * Usage and implementation guidelines are the same as for the spin_begin + * primitives, above. + */ +#ifndef spin_until_cond +#define spin_until_cond(cond) \ +do { \ + if (unlikely(!(cond))) { \ + spin_begin(); \ + do { \ + spin_cpu_relax(); \ + } while (!(cond)); \ + spin_end(); \ + } \ +} while (0) + +#endif + +#endif /* _LINUX_PROCESSOR_H */ diff --git a/include/misc/cxllib.h b/include/misc/cxllib.h new file mode 100644 index 000000000000..e5aa29f019a6 --- /dev/null +++ b/include/misc/cxllib.h @@ -0,0 +1,133 @@ +/* + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _MISC_CXLLIB_H +#define _MISC_CXLLIB_H + +#include <linux/pci.h> +#include <asm/reg.h> + +/* + * cxl driver exports a in-kernel 'library' API which can be called by + * other drivers to help interacting with an IBM XSL. + */ + +/* + * tells whether capi is supported on the PCIe slot where the + * device is seated + * + * Input: + * dev: device whose slot needs to be checked + * flags: 0 for the time being + */ +bool cxllib_slot_is_supported(struct pci_dev *dev, unsigned long flags); + + +/* + * Returns the configuration parameters to be used by the XSL or device + * + * Input: + * dev: device, used to find PHB + * Output: + * struct cxllib_xsl_config: + * version + * capi BAR address, i.e. 0x2000000000000-0x2FFFFFFFFFFFF + * capi BAR size + * data send control (XSL_DSNCTL) + * dummy read address (XSL_DRA) + */ +#define CXL_XSL_CONFIG_VERSION1 1 +struct cxllib_xsl_config { + u32 version; /* format version for register encoding */ + u32 log_bar_size;/* log size of the capi_window */ + u64 bar_addr; /* address of the start of capi window */ + u64 dsnctl; /* matches definition of XSL_DSNCTL */ + u64 dra; /* real address that can be used for dummy read */ +}; + +int cxllib_get_xsl_config(struct pci_dev *dev, struct cxllib_xsl_config *cfg); + + +/* + * Activate capi for the pci host bridge associated with the device. + * Can be extended to deactivate once we know how to do it. + * Device must be ready to accept messages from the CAPP unit and + * respond accordingly (TLB invalidates, ...) + * + * PHB is switched to capi mode through calls to skiboot. + * CAPP snooping is activated + * + * Input: + * dev: device whose PHB should switch mode + * mode: mode to switch to i.e. 
CAPI or PCI + * flags: options related to the mode + */ +enum cxllib_mode { + CXL_MODE_CXL, + CXL_MODE_PCI, +}; + +#define CXL_MODE_NO_DMA 0 +#define CXL_MODE_DMA_TVT0 1 +#define CXL_MODE_DMA_TVT1 2 + +int cxllib_switch_phb_mode(struct pci_dev *dev, enum cxllib_mode mode, + unsigned long flags); + + +/* + * Set the device for capi DMA. + * Define its dma_ops and dma offset so that allocations will be using TVT#1 + * + * Input: + * dev: device to set + * flags: options. CXL_MODE_DMA_TVT1 should be used + */ +int cxllib_set_device_dma(struct pci_dev *dev, unsigned long flags); + + +/* + * Get the Process Element structure for the given thread + * + * Input: + * task: task_struct for the context of the translation + * translation_mode: whether addresses should be translated + * Output: + * attr: attributes to fill up the Process Element structure from CAIA + */ +struct cxllib_pe_attributes { + u64 sr; + u32 lpid; + u32 tid; + u32 pid; +}; +#define CXL_TRANSLATED_MODE 0 +#define CXL_REAL_MODE 1 + +int cxllib_get_PE_attributes(struct task_struct *task, + unsigned long translation_mode, struct cxllib_pe_attributes *attr); + + +/* + * Handle memory fault. + * Fault in all the pages of the specified buffer for the permissions + * provided in ‘flags’ + * + * Shouldn't be called from interrupt context + * + * Input: + * mm: struct mm for the thread faulting the pages + * addr: base address of the buffer to page in + * size: size of the buffer to page in + * flags: permission requested (DSISR_ISSTORE...) + */ +int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags); + + +#endif /* _MISC_CXLLIB_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 857f6ef368d4..46ef77d5c332 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -669,12 +669,16 @@ config IDLE_PAGE_TRACKING See Documentation/vm/idle_page_tracking.txt for more details. +# arch_add_memory() comprehends device memory +config ARCH_HAS_ZONE_DEVICE + bool + config ZONE_DEVICE bool "Device memory (pmem, etc...) 
hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on X86_64 #arch_add_memory() comprehends device memory + depends on ARCH_HAS_ZONE_DEVICE help Device memory hotplug support allows for establishing pmem, diff --git a/tools/testing/selftests/powerpc/benchmarks/context_switch.c b/tools/testing/selftests/powerpc/benchmarks/context_switch.c index 778f5fbfd784..f4241339edd2 100644 --- a/tools/testing/selftests/powerpc/benchmarks/context_switch.c +++ b/tools/testing/selftests/powerpc/benchmarks/context_switch.c @@ -258,9 +258,14 @@ static unsigned long xchg(unsigned long *p, unsigned long val) return __atomic_exchange_n(p, val, __ATOMIC_SEQ_CST); } +static int processes; + static int mutex_lock(unsigned long *m) { int c; + int flags = FUTEX_WAIT; + if (!processes) + flags |= FUTEX_PRIVATE_FLAG; c = cmpxchg(m, 0, 1); if (!c) @@ -270,7 +275,7 @@ static int mutex_lock(unsigned long *m) c = xchg(m, 2); while (c) { - sys_futex(m, FUTEX_WAIT, 2, NULL, NULL, 0); + sys_futex(m, flags, 2, NULL, NULL, 0); c = xchg(m, 2); } @@ -279,12 +284,16 @@ static int mutex_lock(unsigned long *m) static int mutex_unlock(unsigned long *m) { + int flags = FUTEX_WAKE; + if (!processes) + flags |= FUTEX_PRIVATE_FLAG; + if (*m == 2) *m = 0; else if (xchg(m, 0) == 1) return 0; - sys_futex(m, FUTEX_WAKE, 1, NULL, NULL, 0); + sys_futex(m, flags, 1, NULL, NULL, 0); return 0; } @@ -293,26 +302,32 @@ static unsigned long *m1, *m2; static void futex_setup(int cpu1, int cpu2) { - int shmid; - void *shmaddr; + if (!processes) { + static unsigned long _m1, _m2; + m1 = &_m1; + m2 = &_m2; + } else { + int shmid; + void *shmaddr; - shmid = shmget(IPC_PRIVATE, getpagesize(), SHM_R | SHM_W); - if (shmid < 0) { - perror("shmget"); - exit(1); - } + shmid = shmget(IPC_PRIVATE, getpagesize(), SHM_R | SHM_W); + if (shmid < 0) { + perror("shmget"); + exit(1); + } - shmaddr = shmat(shmid, NULL, 0); - if (shmaddr == (char *)-1) { - perror("shmat"); - 
shmctl(shmid, IPC_RMID, NULL); - exit(1); - } + shmaddr = shmat(shmid, NULL, 0); + if (shmaddr == (char *)-1) { + perror("shmat"); + shmctl(shmid, IPC_RMID, NULL); + exit(1); + } - shmctl(shmid, IPC_RMID, NULL); + shmctl(shmid, IPC_RMID, NULL); - m1 = shmaddr; - m2 = shmaddr + sizeof(*m1); + m1 = shmaddr; + m2 = shmaddr + sizeof(*m1); + } *m1 = 0; *m2 = 0; @@ -352,8 +367,6 @@ static struct actions futex_actions = { .thread2 = futex_thread2, }; -static int processes; - static struct option options[] = { { "test", required_argument, 0, 't' }, { "process", no_argument, &processes, 1 }, |