-rw-r--r--Documentation/ABI/testing/sysfs-kernel-iommu_groups6
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-numa24
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt29
-rw-r--r--Documentation/admin-guide/mm/numa_memory_policy.rst15
-rw-r--r--Documentation/admin-guide/sysctl/vm.rst3
-rw-r--r--Documentation/core-api/cachetlb.rst86
-rw-r--r--Documentation/dev-tools/kasan.rst13
-rw-r--r--Documentation/devicetree/bindings/iommu/apple,dart.yaml81
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt36
-rw-r--r--Documentation/powerpc/associativity.rst105
-rw-r--r--Documentation/powerpc/index.rst1
-rw-r--r--Documentation/translations/zh_CN/core-api/cachetlb.rst9
-rw-r--r--Documentation/vm/hwpoison.rst1
-rw-r--r--MAINTAINERS18
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/arm/include/asm/cacheflush.h4
-rw-r--r--arch/arm/kernel/setup.c20
-rw-r--r--arch/arm/mm/flush.c33
-rw-r--r--arch/arm/mm/nommu.c6
-rw-r--r--arch/arm/tools/syscall.tbl2
-rw-r--r--arch/arm64/include/asm/unistd.h2
-rw-r--r--arch/arm64/include/asm/unistd32.h2
-rw-r--r--arch/arm64/kvm/hyp/reserved_mem.c9
-rw-r--r--arch/arm64/mm/init.c36
-rw-r--r--arch/csky/abiv1/cacheflush.c11
-rw-r--r--arch/csky/abiv1/inc/abi/cacheflush.h4
-rw-r--r--arch/csky/kernel/probes/kprobes.c3
-rw-r--r--arch/ia64/include/asm/meminit.h2
-rw-r--r--arch/ia64/kernel/acpi.c2
-rw-r--r--arch/ia64/kernel/setup.c53
-rw-r--r--arch/ia64/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/microblaze/include/asm/page.h3
-rw-r--r--arch/microblaze/include/asm/pgtable.h2
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/microblaze/mm/init.c12
-rw-r--r--arch/microblaze/mm/pgtable.c17
-rw-r--r--arch/mips/Kbuild.platforms1
-rw-r--r--arch/mips/Kconfig30
-rw-r--r--arch/mips/Makefile3
-rw-r--r--arch/mips/alchemy/devboards/db1200.c2
-rw-r--r--arch/mips/boot/dts/Makefile2
-rw-r--r--arch/mips/boot/dts/img/Makefile3
-rw-r--r--arch/mips/boot/dts/img/pistachio.dtsi10
-rw-r--r--arch/mips/boot/dts/mscc/ocelot.dtsi11
-rw-r--r--arch/mips/boot/dts/mscc/ocelot_pcb120.dts12
-rw-r--r--arch/mips/boot/dts/mscc/ocelot_pcb123.dts8
-rw-r--r--arch/mips/cavium-octeon/executive/cvmx-bootmem.c10
-rw-r--r--arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c33
-rw-r--r--arch/mips/cavium-octeon/executive/cvmx-helper-board.c8
-rw-r--r--arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c12
-rw-r--r--arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c8
-rw-r--r--arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c17
-rw-r--r--arch/mips/cavium-octeon/executive/cvmx-l2c.c9
-rw-r--r--arch/mips/cavium-octeon/executive/cvmx-pko.c22
-rw-r--r--arch/mips/cavium-octeon/executive/cvmx-spi.c20
-rw-r--r--arch/mips/cavium-octeon/flash_setup.c2
-rw-r--r--arch/mips/cavium-octeon/setup.c81
-rw-r--r--arch/mips/cavium-octeon/smp.c14
-rw-r--r--arch/mips/configs/generic/board-marduk.config53
-rw-r--r--arch/mips/configs/pistachio_defconfig316
-rw-r--r--arch/mips/generic/Kconfig6
-rw-r--r--arch/mips/generic/Platform1
-rw-r--r--arch/mips/generic/board-ingenic.c49
-rw-r--r--arch/mips/generic/board-marduk.its.S22
-rw-r--r--arch/mips/generic/board-ocelot.c6
-rw-r--r--arch/mips/include/asm/atomic.h2
-rw-r--r--arch/mips/include/asm/bootinfo.h3
-rw-r--r--arch/mips/include/asm/cacheflush.h8
-rw-r--r--arch/mips/include/asm/cpu.h4
-rw-r--r--arch/mips/kernel/mips-mt-fpaff.c10
-rw-r--r--arch/mips/kernel/process.c4
-rw-r--r--arch/mips/kernel/setup.c14
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl2
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl2
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl2
-rw-r--r--arch/mips/kernel/uprobes.c10
-rw-r--r--arch/mips/kvm/Makefile19
-rw-r--r--arch/mips/kvm/mmu.c4
-rw-r--r--arch/mips/loongson2ef/common/Makefile4
-rw-r--r--arch/mips/mm/c-octeon.c29
-rw-r--r--arch/mips/mti-malta/malta-dtshim.c2
-rw-r--r--arch/mips/netlogic/xlr/fmn-config.c15
-rw-r--r--arch/mips/pistachio/Kconfig14
-rw-r--r--arch/mips/pistachio/Makefile2
-rw-r--r--arch/mips/pistachio/Platform6
-rw-r--r--arch/mips/pistachio/init.c125
-rw-r--r--arch/mips/pistachio/irq.c24
-rw-r--r--arch/mips/pistachio/time.c55
-rw-r--r--arch/nds32/include/asm/cacheflush.h3
-rw-r--r--arch/nds32/mm/cacheflush.c9
-rw-r--r--arch/openrisc/boot/dts/or1klitex.dts13
-rw-r--r--arch/openrisc/configs/or1klitex_defconfig26
-rw-r--r--arch/openrisc/include/asm/pgtable.h6
-rw-r--r--arch/openrisc/include/asm/setup.h15
-rw-r--r--arch/openrisc/include/asm/thread_info.h2
-rw-r--r--arch/openrisc/kernel/entry.S6
-rw-r--r--arch/openrisc/kernel/head.S6
-rw-r--r--arch/openrisc/kernel/setup.c20
-rw-r--r--arch/openrisc/lib/Makefile2
-rw-r--r--arch/openrisc/mm/fault.c2
-rw-r--r--arch/parisc/boot/compressed/misc.c2
-rw-r--r--arch/parisc/include/asm/cacheflush.h8
-rw-r--r--arch/parisc/include/uapi/asm/swab.h68
-rw-r--r--arch/parisc/kernel/cache.c3
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/Kconfig.debug30
-rw-r--r--arch/powerpc/Makefile4
-rw-r--r--arch/powerpc/boot/Makefile11
-rw-r--r--arch/powerpc/boot/dts/fsl/sbc8641d.dts176
-rw-r--r--arch/powerpc/boot/dts/microwatt.dts12
-rw-r--r--arch/powerpc/boot/dts/sbc8548-altflash.dts111
-rw-r--r--arch/powerpc/boot/dts/sbc8548-post.dtsi289
-rw-r--r--arch/powerpc/boot/dts/sbc8548-pre.dtsi48
-rw-r--r--arch/powerpc/boot/dts/sbc8548.dts106
-rw-r--r--arch/powerpc/boot/dts/wii.dts13
-rw-r--r--arch/powerpc/boot/install.sh27
-rwxr-xr-xarch/powerpc/boot/wrapper2
-rw-r--r--arch/powerpc/configs/85xx/sbc8548_defconfig50
-rw-r--r--arch/powerpc/configs/microwatt_defconfig7
-rw-r--r--arch/powerpc/configs/mpc85xx_base.config1
-rw-r--r--arch/powerpc/configs/mpc86xx_base.config1
-rw-r--r--arch/powerpc/configs/mpc885_ads_defconfig49
-rw-r--r--arch/powerpc/configs/ppc6xx_defconfig1
-rw-r--r--arch/powerpc/configs/wii_defconfig1
-rw-r--r--arch/powerpc/include/asm/asm-compat.h4
-rw-r--r--arch/powerpc/include/asm/atomic.h4
-rw-r--r--arch/powerpc/include/asm/bitops.h8
-rw-r--r--arch/powerpc/include/asm/book3s/64/kup.h2
-rw-r--r--arch/powerpc/include/asm/bug.h62
-rw-r--r--arch/powerpc/include/asm/debugfs.h13
-rw-r--r--arch/powerpc/include/asm/drmem.h1
-rw-r--r--arch/powerpc/include/asm/extable.h14
-rw-r--r--arch/powerpc/include/asm/firmware.h7
-rw-r--r--arch/powerpc/include/asm/iommu.h1
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h1
-rw-r--r--arch/powerpc/include/asm/kvm_host.h2
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h4
-rw-r--r--arch/powerpc/include/asm/membarrier.h3
-rw-r--r--arch/powerpc/include/asm/mmu.h2
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h5
-rw-r--r--arch/powerpc/include/asm/pmc.h7
-rw-r--r--arch/powerpc/include/asm/pnv-pci.h2
-rw-r--r--arch/powerpc/include/asm/ppc-opcode.h2
-rw-r--r--arch/powerpc/include/asm/ppc_asm.h13
-rw-r--r--arch/powerpc/include/asm/prom.h3
-rw-r--r--arch/powerpc/include/asm/ptrace.h37
-rw-r--r--arch/powerpc/include/asm/reg.h3
-rw-r--r--arch/powerpc/include/asm/sections.h8
-rw-r--r--arch/powerpc/include/asm/simple_spinlock.h6
-rw-r--r--arch/powerpc/include/asm/smp.h6
-rw-r--r--arch/powerpc/include/asm/syscall.h20
-rw-r--r--arch/powerpc/include/asm/syscalls.h30
-rw-r--r--arch/powerpc/include/asm/tce.h8
-rw-r--r--arch/powerpc/include/asm/topology.h19
-rw-r--r--arch/powerpc/include/asm/unistd.h2
-rw-r--r--arch/powerpc/include/asm/vdso/processor.h9
-rw-r--r--arch/powerpc/include/asm/xics.h3
-rw-r--r--arch/powerpc/include/asm/xive-regs.h3
-rw-r--r--arch/powerpc/include/asm/xive.h2
-rw-r--r--arch/powerpc/kernel/Makefile3
-rw-r--r--arch/powerpc/kernel/asm-offsets.c15
-rw-r--r--arch/powerpc/kernel/cacheinfo.c124
-rw-r--r--arch/powerpc/kernel/dawr.c3
-rw-r--r--arch/powerpc/kernel/eeh.c16
-rw-r--r--arch/powerpc/kernel/eeh_cache.c4
-rw-r--r--arch/powerpc/kernel/entry_32.S4
-rw-r--r--arch/powerpc/kernel/entry_64.S2
-rw-r--r--arch/powerpc/kernel/exceptions-64e.S24
-rw-r--r--arch/powerpc/kernel/fadump.c4
-rw-r--r--arch/powerpc/kernel/fpu.S3
-rw-r--r--arch/powerpc/kernel/fsl_booke_entry_mapping.S8
-rw-r--r--arch/powerpc/kernel/head_44x.S6
-rw-r--r--arch/powerpc/kernel/head_64.S2
-rw-r--r--arch/powerpc/kernel/head_fsl_booke.S6
-rw-r--r--arch/powerpc/kernel/hw_breakpoint.c1
-rw-r--r--arch/powerpc/kernel/interrupt.c12
-rw-r--r--arch/powerpc/kernel/iommu.c61
-rw-r--r--arch/powerpc/kernel/kdebugfs.c14
-rw-r--r--arch/powerpc/kernel/misc.S2
-rw-r--r--arch/powerpc/kernel/misc_32.S4
-rw-r--r--arch/powerpc/kernel/misc_64.S2
-rw-r--r--arch/powerpc/kernel/pci-common.c6
-rw-r--r--arch/powerpc/kernel/process.c2
-rw-r--r--arch/powerpc/kernel/prom.c5
-rw-r--r--arch/powerpc/kernel/prom_init.c3
-rw-r--r--arch/powerpc/kernel/ptrace/ptrace.c4
-rw-r--r--arch/powerpc/kernel/reloc_32.S2
-rw-r--r--arch/powerpc/kernel/rtasd.c4
-rw-r--r--arch/powerpc/kernel/security.c16
-rw-r--r--arch/powerpc/kernel/setup-common.c13
-rw-r--r--arch/powerpc/kernel/setup_64.c1
-rw-r--r--arch/powerpc/kernel/smp.c88
-rw-r--r--arch/powerpc/kernel/stacktrace.c1
-rw-r--r--arch/powerpc/kernel/syscalls.c15
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/powerpc/kernel/tau_6xx.c2
-rw-r--r--arch/powerpc/kernel/time.c3
-rw-r--r--arch/powerpc/kernel/traps.c23
-rw-r--r--arch/powerpc/kernel/vector.S4
-rw-r--r--arch/powerpc/kexec/core_64.c10
-rw-r--r--arch/powerpc/kexec/relocate_32.S12
-rw-r--r--arch/powerpc/kvm/Kconfig1
-rw-r--r--arch/powerpc/kvm/book3s.h3
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu.c3
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c12
-rw-r--r--arch/powerpc/kvm/book3s_64_vio_hv.c9
-rw-r--r--arch/powerpc/kvm/book3s_hv.c108
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c10
-rw-r--r--arch/powerpc/kvm/book3s_hv_nested.c101
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c8
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S42
-rw-r--r--arch/powerpc/kvm/book3s_hv_tm.c61
-rw-r--r--arch/powerpc/kvm/book3s_xics.c6
-rw-r--r--arch/powerpc/kvm/book3s_xive.c74
-rw-r--r--arch/powerpc/kvm/book3s_xive.h11
-rw-r--r--arch/powerpc/kvm/book3s_xive_native.c24
-rw-r--r--arch/powerpc/mm/Makefile2
-rw-r--r--arch/powerpc/mm/book3s64/hash_native.c2
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c4
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c8
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c3
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c16
-rw-r--r--arch/powerpc/mm/book3s64/slb.c2
-rw-r--r--arch/powerpc/mm/drmem.c46
-rw-r--r--arch/powerpc/mm/mmu_decl.h2
-rw-r--r--arch/powerpc/mm/nohash/tlb_low.S4
-rw-r--r--arch/powerpc/mm/numa.c491
-rw-r--r--arch/powerpc/mm/ptdump/8xx.c6
-rw-r--r--arch/powerpc/mm/ptdump/Makefile9
-rw-r--r--arch/powerpc/mm/ptdump/bats.c18
-rw-r--r--arch/powerpc/mm/ptdump/book3s64.c6
-rw-r--r--arch/powerpc/mm/ptdump/hashpagetable.c12
-rw-r--r--arch/powerpc/mm/ptdump/ptdump.c178
-rw-r--r--arch/powerpc/mm/ptdump/segment_regs.c16
-rw-r--r--arch/powerpc/mm/ptdump/shared.c6
-rw-r--r--arch/powerpc/perf/core-book3s.c21
-rw-r--r--arch/powerpc/perf/hv-gpci.c2
-rw-r--r--arch/powerpc/platforms/44x/machine_check.c4
-rw-r--r--arch/powerpc/platforms/4xx/machine_check.c2
-rw-r--r--arch/powerpc/platforms/85xx/Kconfig6
-rw-r--r--arch/powerpc/platforms/85xx/Makefile1
-rw-r--r--arch/powerpc/platforms/85xx/sbc8548.c134
-rw-r--r--arch/powerpc/platforms/86xx/Kconfig8
-rw-r--r--arch/powerpc/platforms/86xx/Makefile1
-rw-r--r--arch/powerpc/platforms/86xx/sbc8641d.c87
-rw-r--r--arch/powerpc/platforms/cell/axon_msi.c4
-rw-r--r--arch/powerpc/platforms/embedded6xx/holly.c2
-rw-r--r--arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c2
-rw-r--r--arch/powerpc/platforms/pasemi/idle.c2
-rw-r--r--arch/powerpc/platforms/powernv/idle.c6
-rw-r--r--arch/powerpc/platforms/powernv/memtrace.c3
-rw-r--r--arch/powerpc/platforms/powernv/opal-imc.c12
-rw-r--r--arch/powerpc/platforms/powernv/opal-lpc.c4
-rw-r--r--arch/powerpc/platforms/powernv/opal-xscom.c4
-rw-r--r--arch/powerpc/platforms/powernv/opal.c2
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c260
-rw-r--r--arch/powerpc/platforms/powernv/pci.c67
-rw-r--r--arch/powerpc/platforms/powernv/pci.h6
-rw-r--r--arch/powerpc/platforms/ps3/htab.c3
-rw-r--r--arch/powerpc/platforms/ps3/mm.c8
-rw-r--r--arch/powerpc/platforms/pseries/dtl.c4
-rw-r--r--arch/powerpc/platforms/pseries/firmware.c3
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-cpu.c173
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c10
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c514
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c18
-rw-r--r--arch/powerpc/platforms/pseries/msi.c296
-rw-r--r--arch/powerpc/platforms/pseries/pci_dlpar.c4
-rw-r--r--arch/powerpc/platforms/pseries/pseries.h2
-rw-r--r--arch/powerpc/platforms/pseries/ras.c2
-rw-r--r--arch/powerpc/platforms/pseries/setup.c2
-rw-r--r--arch/powerpc/platforms/pseries/svm.c6
-rw-r--r--arch/powerpc/platforms/pseries/vas.c2
-rw-r--r--arch/powerpc/sysdev/fsl_rio.c2
-rw-r--r--arch/powerpc/sysdev/xics/ics-native.c13
-rw-r--r--arch/powerpc/sysdev/xics/ics-opal.c40
-rw-r--r--arch/powerpc/sysdev/xics/ics-rtas.c40
-rw-r--r--arch/powerpc/sysdev/xics/xics-common.c131
-rw-r--r--arch/powerpc/sysdev/xive/common.c103
-rw-r--r--arch/powerpc/sysdev/xive/native.c10
-rw-r--r--arch/powerpc/tools/head_check.sh24
-rw-r--r--arch/powerpc/xmon/xmon.c22
-rw-r--r--arch/riscv/mm/init.c44
-rw-r--r--arch/s390/kernel/setup.c9
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/s390/mm/fault.c2
-rw-r--r--arch/s390/mm/init.c2
-rw-r--r--arch/sh/include/asm/cacheflush.h8
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl2
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/kernel/aperture_64.c5
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/mm/init.c23
-rw-r--r--arch/x86/mm/numa.c5
-rw-r--r--arch/x86/mm/numa_emulation.c5
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl2
-rw-r--r--block/blk-map.c2
-rw-r--r--drivers/acpi/tables.c5
-rw-r--r--drivers/base/arch_numa.c5
-rw-r--r--drivers/base/core.c4
-rw-r--r--drivers/base/memory.c4
-rw-r--r--drivers/clk/Kconfig1
-rw-r--r--drivers/clk/Makefile2
-rw-r--r--drivers/clk/pistachio/Kconfig8
-rw-r--r--drivers/clocksource/Kconfig3
-rw-r--r--drivers/cpufreq/powernv-cpufreq.c16
-rw-r--r--drivers/cpuidle/cpuidle-pseries.c77
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_internal.c2
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_ttm.c2
-rw-r--r--drivers/iommu/Kconfig69
-rw-r--r--drivers/iommu/Makefile1
-rw-r--r--drivers/iommu/amd/amd_iommu_types.h6
-rw-r--r--drivers/iommu/amd/init.c12
-rw-r--r--drivers/iommu/amd/io_pgtable.c3
-rw-r--r--drivers/iommu/amd/iommu.c151
-rw-r--r--drivers/iommu/amd/iommu_v2.c13
-rw-r--r--drivers/iommu/apple-dart.c923
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c121
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c11
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.c89
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.h1
-rw-r--r--drivers/iommu/arm/arm-smmu/qcom_iommu.c9
-rw-r--r--drivers/iommu/dma-iommu.c63
-rw-r--r--drivers/iommu/exynos-iommu.c19
-rw-r--r--drivers/iommu/intel/Kconfig19
-rw-r--r--drivers/iommu/intel/dmar.c2
-rw-r--r--drivers/iommu/intel/iommu.c197
-rw-r--r--drivers/iommu/intel/pasid.c18
-rw-r--r--drivers/iommu/intel/pasid.h10
-rw-r--r--drivers/iommu/intel/perf.c2
-rw-r--r--drivers/iommu/intel/svm.c4
-rw-r--r--drivers/iommu/io-pgtable-arm-v7s.c62
-rw-r--r--drivers/iommu/io-pgtable-arm.c282
-rw-r--r--drivers/iommu/io-pgtable.c1
-rw-r--r--drivers/iommu/iommu.c195
-rw-r--r--drivers/iommu/iova.c14
-rw-r--r--drivers/iommu/ipmmu-vmsa.c28
-rw-r--r--drivers/iommu/mtk_iommu.c13
-rw-r--r--drivers/iommu/mtk_iommu_v1.c1
-rw-r--r--drivers/iommu/rockchip-iommu.c12
-rw-r--r--drivers/iommu/sprd-iommu.c7
-rw-r--r--drivers/iommu/sun50i-iommu.c13
-rw-r--r--drivers/iommu/virtio-iommu.c8
-rw-r--r--drivers/mmc/host/jz4740_mmc.c4
-rw-r--r--drivers/mmc/host/mmc_spi.c2
-rw-r--r--drivers/of/device.c40
-rw-r--r--drivers/of/of_reserved_mem.c12
-rw-r--r--drivers/pci/xen-pcifront.c2
-rw-r--r--drivers/phy/Kconfig2
-rw-r--r--drivers/xen/swiotlb-xen.c8
-rw-r--r--fs/drop_caches.c3
-rw-r--r--fs/exec.c8
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/fs-writeback.c28
-rw-r--r--fs/fs_context.c4
-rw-r--r--fs/inode.c2
-rw-r--r--fs/locks.c6
-rw-r--r--fs/namei.c8
-rw-r--r--fs/namespace.c7
-rw-r--r--fs/ocfs2/dlmglue.c14
-rw-r--r--fs/ocfs2/quota_global.c1
-rw-r--r--fs/ocfs2/quota_local.c2
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/select.c4
-rw-r--r--fs/userfaultfd.c116
-rw-r--r--include/linux/backing-dev-defs.h2
-rw-r--r--include/linux/backing-dev.h19
-rw-r--r--include/linux/buffer_head.h2
-rw-r--r--include/linux/compaction.h2
-rw-r--r--include/linux/device.h4
-rw-r--r--include/linux/dma-iommu.h6
-rw-r--r--include/linux/highmem.h5
-rw-r--r--include/linux/hugetlb_cgroup.h12
-rw-r--r--include/linux/intel-iommu.h6
-rw-r--r--include/linux/intel-svm.h5
-rw-r--r--include/linux/io-pgtable.h20
-rw-r--r--include/linux/iommu.h114
-rw-r--r--include/linux/memblock.h2
-rw-r--r--include/linux/memcontrol.h104
-rw-r--r--include/linux/memory.h2
-rw-r--r--include/linux/mempolicy.h16
-rw-r--r--include/linux/migrate.h14
-rw-r--r--include/linux/mm.h17
-rw-r--r--include/linux/mmzone.h4
-rw-r--r--include/linux/pagemap.h4
-rw-r--r--include/linux/sched/mm.h10
-rw-r--r--include/linux/shmem_fs.h25
-rw-r--r--include/linux/swap.h28
-rw-r--r--include/linux/swiotlb.h57
-rw-r--r--include/linux/syscalls.h1
-rw-r--r--include/linux/userfaultfd_k.h8
-rw-r--r--include/linux/vm_event_item.h2
-rw-r--r--include/linux/vmpressure.h2
-rw-r--r--include/linux/writeback.h4
-rw-r--r--include/trace/events/migrate.h3
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--include/uapi/linux/mempolicy.h1
-rw-r--r--ipc/msg.c2
-rw-r--r--ipc/namespace.c2
-rw-r--r--ipc/sem.c9
-rw-r--r--ipc/shm.c2
-rw-r--r--kernel/cgroup/namespace.c2
-rw-r--r--kernel/dma/Kconfig13
-rw-r--r--kernel/dma/direct.c57
-rw-r--r--kernel/dma/direct.h8
-rw-r--r--kernel/dma/swiotlb.c352
-rw-r--r--kernel/irq/irqdomain.c1
-rw-r--r--kernel/livepatch/transition.c4
-rw-r--r--kernel/nsproxy.c2
-rw-r--r--kernel/pid_namespace.c5
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/time/namespace.c4
-rw-r--r--kernel/time/posix-timers.c4
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--lib/scatterlist.c5
-rw-r--r--lib/test_kasan.c80
-rw-r--r--lib/test_kasan_module.c20
-rw-r--r--lib/test_vmalloc.c5
-rw-r--r--mm/backing-dev.c11
-rw-r--r--mm/bootmem_info.c4
-rw-r--r--mm/compaction.c67
-rw-r--r--mm/debug_vm_pgtable.c918
-rw-r--r--mm/filemap.c15
-rw-r--r--mm/gup.c109
-rw-r--r--mm/huge_memory.c32
-rw-r--r--mm/hugetlb.c171
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/internal.h9
-rw-r--r--mm/kasan/hw_tags.c43
-rw-r--r--mm/kasan/kasan.h1
-rw-r--r--mm/kasan/report.c29
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/ksm.c8
-rw-r--r--mm/madvise.c1
-rw-r--r--mm/memblock.c22
-rw-r--r--mm/memcontrol.c228
-rw-r--r--mm/memory-failure.c51
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c177
-rw-r--r--mm/migrate.c315
-rw-r--r--mm/mmap.c7
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/oom_kill.c70
-rw-r--r--mm/page-writeback.c121
-rw-r--r--mm/page_alloc.c62
-rw-r--r--mm/page_isolation.c13
-rw-r--r--mm/percpu.c3
-rw-r--r--mm/shmem.c271
-rw-r--r--mm/sparse.c46
-rw-r--r--mm/swap.c22
-rw-r--r--mm/swapfile.c8
-rw-r--r--mm/truncate.c28
-rw-r--r--mm/userfaultfd.c15
-rw-r--r--mm/vmalloc.c79
-rw-r--r--mm/vmpressure.c10
-rw-r--r--mm/vmscan.c210
-rw-r--r--mm/vmstat.c25
-rw-r--r--scripts/mod/modpost.c2
-rw-r--r--security/tomoyo/domain.c13
-rw-r--r--tools/testing/scatterlist/linux/mm.h1
l---------tools/testing/selftests/powerpc/primitives/asm/extable.h1
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c3
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c3
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c1
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c1
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c1
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c1
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c1
-rw-r--r--tools/testing/selftests/powerpc/signal/signal_tm.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-exec.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-fork.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-poison.c2
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-resched-dscr.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-stack.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-sigreturn.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-syscall.c2
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-tar.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-tmspr.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-trap.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-unavailable.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-vmxcopy.c1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm.h36
-rw-r--r--tools/testing/selftests/vm/.gitignore1
-rw-r--r--tools/testing/selftests/vm/Makefile3
-rw-r--r--tools/testing/selftests/vm/charge_reserved_hugetlb.sh5
-rw-r--r--tools/testing/selftests/vm/hugetlb_reparenting_test.sh5
-rw-r--r--tools/testing/selftests/vm/ksm_tests.c662
-rw-r--r--tools/testing/selftests/vm/mlock-random-test.c2
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests.sh96
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c13
504 files changed, 9249 insertions, 5982 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
index eae2f1c1e11e..b15af6a5bc08 100644
--- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups
+++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
@@ -42,8 +42,12 @@ Description: /sys/kernel/iommu_groups/<grp_id>/type shows the type of default
======== ======================================================
DMA All the DMA transactions from the device in this group
are translated by the iommu.
+ DMA-FQ As above, but using batched invalidation to lazily
+ remove translations after use. This may offer reduced
+ overhead at the cost of reduced memory protection.
identity All the DMA transactions from the device in this group
- are not translated by the iommu.
+ are not translated by the iommu. Maximum performance
+ but zero protection.
auto Change to the type the device was booted with.
======== ======================================================
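
A small user-space sketch of the interface documented above: it enumerates every group's type file and prints the current default domain type. The read side assumes nothing beyond the sysfs layout described here; writing a new type (e.g. "DMA-FQ") to the same file is only expected to succeed while no device in the group is bound to a driver.

    #include <glob.h>
    #include <stdio.h>

    /*
     * Sketch: print the default domain type (DMA, DMA-FQ, identity, ...) of
     * every IOMMU group on the running system.  Read-only; see the note
     * above about when writing a new type can work.
     */
    int main(void)
    {
        glob_t g;

        if (glob("/sys/kernel/iommu_groups/*/type", 0, NULL, &g))
            return 1;
        for (size_t i = 0; i < g.gl_pathc; i++) {
            char type[32] = "";
            FILE *f = fopen(g.gl_pathv[i], "r");

            if (f && fgets(type, sizeof(type), f))
                printf("%s: %s", g.gl_pathv[i], type);
            if (f)
                fclose(f);
        }
        globfree(&g);
        return 0;
    }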
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
new file mode 100644
index 000000000000..77e559d4ed80
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa
@@ -0,0 +1,24 @@
+What: /sys/kernel/mm/numa/
+Date: June 2021
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Interface for NUMA
+
+What: /sys/kernel/mm/numa/demotion_enabled
+Date: June 2021
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Enable/disable demoting pages during reclaim
+
+ Page migration during reclaim is intended for systems
+ with tiered memory configurations. These systems have
+ multiple types of memory with varied performance
+ characteristics instead of plain NUMA systems where
+ the same kind of memory is found at varied distances.
+ Allowing page migration during reclaim enables these
+ systems to migrate pages from fast tiers to slow tiers
+ when the fast tier is under pressure. This migration
+ is performed before swap. It may move data to a NUMA
+ node that does not fall into the cpuset of the
+ allocating process which might be construed to violate
+ the guarantees of cpusets. This should not be enabled
+ on systems which need strict cpuset location
+ guarantees.
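
A minimal sketch of flipping the knob documented above, assuming only the sysfs path given here and root privileges:

    #include <stdio.h>

    /*
     * Minimal sketch: enable reclaim-time demotion.  Requires root; on
     * kernels without the feature the file is absent and fopen() fails.
     * Write "0" (or "false") to turn it back off.
     */
    int main(void)
    {
        FILE *f = fopen("/sys/kernel/mm/numa/demotion_enabled", "w");

        if (!f)
            return 1;
        fputs("1\n", f);    /* boolean attribute: "1"/"true" enables it */
        return fclose(f) ? 1 : 0;
    }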
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 84dc5790741b..828d11441ebf 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -301,10 +301,7 @@
amd_iommu= [HW,X86-64]
Pass parameters to the AMD IOMMU driver in the system.
Possible values are:
- fullflush - enable flushing of IO/TLB entries when
- they are unmapped. Otherwise they are
- flushed before they will be reused, which
- is a lot of faster
+ fullflush - Deprecated, equivalent to iommu.strict=1
off - do not initialize any AMD IOMMU found in
the system
force_isolation - Force device isolation for all
@@ -1958,18 +1955,17 @@
this case, gfx device will use physical address for
DMA.
strict [Default Off]
- With this option on every unmap_single operation will
- result in a hardware IOTLB flush operation as opposed
- to batching them for performance.
+ Deprecated, equivalent to iommu.strict=1.
sp_off [Default Off]
By default, super page will be supported if Intel IOMMU
has the capability. With this option, super page will
not be supported.
- sm_on [Default Off]
- By default, scalable mode will be disabled even if the
- hardware advertises that it has support for the scalable
- mode translation. With this option set, scalable mode
- will be used on hardware which claims to support it.
+ sm_on
+ Enable the Intel IOMMU scalable mode if the hardware
+ advertises that it has support for the scalable mode
+ translation.
+ sm_off
+ Disallow use of the Intel IOMMU scalable mode.
tboot_noforce [Default Off]
Do not force the Intel IOMMU enabled under tboot.
By default, tboot will force Intel IOMMU on, which
@@ -2061,13 +2057,12 @@
throughput at the cost of reduced device isolation.
Will fall back to strict mode if not supported by
the relevant IOMMU driver.
- 1 - Strict mode (default).
+ 1 - Strict mode.
DMA unmap operations invalidate IOMMU hardware TLBs
synchronously.
- Note: on x86, the default behaviour depends on the
- equivalent driver-specific parameters, but a strict
- mode explicitly specified by either method takes
- precedence.
+ unset - Use value of CONFIG_IOMMU_DEFAULT_DMA_{LAZY,STRICT}.
+ Note: on x86, strict mode specified via one of the
+ legacy driver-specific options takes precedence.
iommu.passthrough=
[ARM64, X86] Configure DMA to bypass the IOMMU by default.
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index 067a90a1499c..64fd0ba0d057 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -245,6 +245,13 @@ MPOL_INTERLEAVED
address range or file. During system boot up, the temporary
interleaved system default policy works in this mode.
+MPOL_PREFERRED_MANY
+ This mode specifies that the allocation should preferably be
+ satisfied from the nodemask specified in the policy. If there is
+ memory pressure on all nodes in the nodemask, the allocation
+ can fall back to all existing NUMA nodes. This is effectively
+ MPOL_PREFERRED applied to a mask of nodes rather than a single node.
+
NUMA memory policy supports the following optional mode flags:
MPOL_F_STATIC_NODES
@@ -253,10 +260,10 @@ MPOL_F_STATIC_NODES
nodes changes after the memory policy has been defined.
Without this flag, any time a mempolicy is rebound because of a
- change in the set of allowed nodes, the node (Preferred) or
- nodemask (Bind, Interleave) is remapped to the new set of
- allowed nodes. This may result in nodes being used that were
- previously undesired.
+ change in the set of allowed nodes, the preferred nodemask (Preferred
+ Many), preferred node (Preferred) or nodemask (Bind, Interleave) is
+ remapped to the new set of allowed nodes. This may result in nodes
+ being used that were previously undesired.
With this flag, if the user-specified nodes overlap with the
nodes allowed by the task's cpuset, then the memory policy is
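
The MPOL_PREFERRED_MANY mode described in this hunk is requested through the existing set_mempolicy(2)/mbind(2) interface. A minimal user-space sketch, assuming the libnuma <numaif.h> wrapper (link with -lnuma); the fallback definition of MPOL_PREFERRED_MANY reflects the uapi value at the time of this merge and should be checked against the installed headers:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <numaif.h>             /* set_mempolicy() wrapper from libnuma */

    #ifndef MPOL_PREFERRED_MANY
    #define MPOL_PREFERRED_MANY 5   /* assumed uapi value; verify locally */
    #endif

    int main(void)
    {
        /* Prefer nodes 0 and 1; the kernel may still fall back to any
         * other node when those are under memory pressure. */
        unsigned long nodemask = (1UL << 0) | (1UL << 1);

        if (set_mempolicy(MPOL_PREFERRED_MANY, &nodemask,
                          8 * sizeof(nodemask)) != 0) {
            fprintf(stderr, "set_mempolicy: %s\n", strerror(errno));
            return 1;
        }
        puts("task policy set to MPOL_PREFERRED_MANY for nodes 0-1");
        return 0;
    }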
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 003d5cc3751b..5e795202111f 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -118,7 +118,8 @@ compaction_proactiveness
This tunable takes a value in the range [0, 100] with a default value of
20. This tunable determines how aggressively compaction is done in the
-background. Setting it to 0 disables proactive compaction.
+background. Writing a non-zero value to this tunable immediately
+triggers proactive compaction. Setting it to 0 disables proactive compaction.
Note that compaction has a non-trivial system-wide impact as pages
belonging to different processes are moved around, which could also lead
diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst
index fe4290e26729..8aed9103e48a 100644
--- a/Documentation/core-api/cachetlb.rst
+++ b/Documentation/core-api/cachetlb.rst
@@ -271,10 +271,15 @@ maps this page at its virtual address.
``void flush_dcache_page(struct page *page)``
- Any time the kernel writes to a page cache page, _OR_
- the kernel is about to read from a page cache page and
- user space shared/writable mappings of this page potentially
- exist, this routine is called.
+ This routine must be called when:
+
+ a) the kernel did write to a page that is in the page cache
+ and / or in high memory
+ b) the kernel is about to read from a page cache page and user space
+ shared/writable mappings of this page potentially exist. Note
+ that {get,pin}_user_pages{_fast} already call flush_dcache_page
+ on any page found in the user address space and thus driver
+ code rarely needs to take this into account.
.. note::
@@ -284,38 +289,34 @@ maps this page at its virtual address.
handling vfs symlinks in the page cache need not call
this interface at all.
- The phrase "kernel writes to a page cache page" means,
- specifically, that the kernel executes store instructions
- that dirty data in that page at the page->virtual mapping
- of that page. It is important to flush here to handle
- D-cache aliasing, to make sure these kernel stores are
- visible to user space mappings of that page.
-
- The corollary case is just as important, if there are users
- which have shared+writable mappings of this file, we must make
- sure that kernel reads of these pages will see the most recent
- stores done by the user.
-
- If D-cache aliasing is not an issue, this routine may
- simply be defined as a nop on that architecture.
-
- There is a bit set aside in page->flags (PG_arch_1) as
- "architecture private". The kernel guarantees that,
- for pagecache pages, it will clear this bit when such
- a page first enters the pagecache.
-
- This allows these interfaces to be implemented much more
- efficiently. It allows one to "defer" (perhaps indefinitely)
- the actual flush if there are currently no user processes
- mapping this page. See sparc64's flush_dcache_page and
- update_mmu_cache implementations for an example of how to go
- about doing this.
-
- The idea is, first at flush_dcache_page() time, if
- page->mapping->i_mmap is an empty tree, just mark the architecture
- private page flag bit. Later, in update_mmu_cache(), a check is
- made of this flag bit, and if set the flush is done and the flag
- bit is cleared.
+ The phrase "kernel writes to a page cache page" means, specifically,
+ that the kernel executes store instructions that dirty data in that
+ page at the page->virtual mapping of that page. It is important to
+ flush here to handle D-cache aliasing, to make sure these kernel stores
+ are visible to user space mappings of that page.
+
+ The corollary case is just as important, if there are users which have
+ shared+writable mappings of this file, we must make sure that kernel
+ reads of these pages will see the most recent stores done by the user.
+
+ If D-cache aliasing is not an issue, this routine may simply be defined
+ as a nop on that architecture.
+
+ There is a bit set aside in page->flags (PG_arch_1) as "architecture
+ private". The kernel guarantees that, for pagecache pages, it will
+ clear this bit when such a page first enters the pagecache.
+
+ This allows these interfaces to be implemented much more efficiently.
+ It allows one to "defer" (perhaps indefinitely) the actual flush if
+ there are currently no user processes mapping this page. See sparc64's
+ flush_dcache_page and update_mmu_cache implementations for an example
+ of how to go about doing this.
+
+ The idea is, first at flush_dcache_page() time, if page_file_mapping()
+ returns a mapping, and mapping_mapped on that mapping returns %false,
+ just mark the architecture private page flag bit. Later, in
+ update_mmu_cache(), a check is made of this flag bit, and if set the
+ flush is done and the flag bit is cleared.
.. important::
@@ -351,19 +352,6 @@ maps this page at its virtual address.
architectures). For incoherent architectures, it should flush
the cache of the page at vmaddr.
- ``void flush_kernel_dcache_page(struct page *page)``
-
- When the kernel needs to modify a user page is has obtained
- with kmap, it calls this function after all modifications are
- complete (but before kunmapping it) to bring the underlying
- page up to date. It is assumed here that the user has no
- incoherent cached copies (i.e. the original page was obtained
- from a mechanism like get_user_pages()). The default
- implementation is a nop and should remain so on all coherent
- architectures. On incoherent architectures, this should flush
- the kernel cache for page (using page_address(page)).
-
-
``void flush_icache_range(unsigned long start, unsigned long end)``
When the kernel stores into addresses that it will execute
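
A driver-style sketch of rule (a) above: write to a pagecache page through a temporary kernel mapping, then call flush_dcache_page() so the stores become visible to user-space mappings. The helper name is illustrative only, not an existing kernel function:

    #include <linux/highmem.h>
    #include <linux/mm.h>
    #include <linux/string.h>

    /*
     * Illustrative helper: copy data into a pagecache page through a
     * temporary kernel mapping, then make the stores visible to any
     * user-space mappings of that page.  On architectures without D-cache
     * aliasing, flush_dcache_page() is a no-op, so this costs nothing there.
     */
    static void fill_pagecache_page(struct page *page, const void *src, size_t len)
    {
        size_t n = len < PAGE_SIZE ? len : PAGE_SIZE;
        void *kaddr = kmap_local_page(page);

        memcpy(kaddr, src, n);
        kunmap_local(kaddr);

        flush_dcache_page(page);    /* case (a): the kernel wrote the page */
    }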
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 83ec4a556c19..21dc03bc10a4 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -181,9 +181,16 @@ By default, KASAN prints a bug report only for the first invalid memory access.
With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This
effectively disables ``panic_on_warn`` for KASAN reports.
+Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot
+parameter can be used to control panic and reporting behaviour:
+
+- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
+ report or also panic the kernel (default: ``report``). The panic happens even
+ if ``kasan_multi_shot`` is enabled.
+
Hardware tag-based KASAN mode (see the section about various modes below) is
intended for use in production as a security mitigation. Therefore, it supports
-boot parameters that allow disabling KASAN or controlling its features.
+additional boot parameters that allow disabling KASAN or controlling features:
- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
@@ -199,10 +206,6 @@ boot parameters that allow disabling KASAN or controlling its features.
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
traces collection (default: ``on``).
-- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
- report or also panic the kernel (default: ``report``). The panic happens even
- if ``kasan_multi_shot`` is enabled.
-
Implementation details
----------------------
diff --git a/Documentation/devicetree/bindings/iommu/apple,dart.yaml b/Documentation/devicetree/bindings/iommu/apple,dart.yaml
new file mode 100644
index 000000000000..94aa9e9afa59
--- /dev/null
+++ b/Documentation/devicetree/bindings/iommu/apple,dart.yaml
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iommu/apple,dart.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Apple DART IOMMU
+
+maintainers:
+ - Sven Peter <sven@svenpeter.dev>
+
+description: |+
+ Apple SoCs may contain an implementation of their Device Address
+ Resolution Table which provides a mandatory layer of address
+ translations for various masters.
+
+ Each DART instance is capable of handling up to 16 different streams
+ with individual pagetables and page-level read/write protection flags.
+
+ This DART IOMMU also raises interrupts in response to various
+ fault conditions.
+
+properties:
+ compatible:
+ const: apple,t8103-dart
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ description:
+ Reference to the gate clock phandle if required for this IOMMU.
+ Optional since not all IOMMUs are attached to a clock gate.
+
+ '#iommu-cells':
+ const: 1
+ description:
+ Has to be one. The single cell describes the stream id emitted by
+ a master to the IOMMU.
+
+required:
+ - compatible
+ - reg
+ - '#iommu-cells'
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |+
+ dart1: iommu@82f80000 {
+ compatible = "apple,t8103-dart";
+ reg = <0x82f80000 0x4000>;
+ interrupts = <1 781 4>;
+ #iommu-cells = <1>;
+ };
+
+ master1 {
+ iommus = <&dart1 0>;
+ };
+
+ - |+
+ dart2a: iommu@82f00000 {
+ compatible = "apple,t8103-dart";
+ reg = <0x82f00000 0x4000>;
+ interrupts = <1 781 4>;
+ #iommu-cells = <1>;
+ };
+ dart2b: iommu@82f80000 {
+ compatible = "apple,t8103-dart";
+ reg = <0x82f80000 0x4000>;
+ interrupts = <1 781 4>;
+ #iommu-cells = <1>;
+ };
+
+ master2 {
+ iommus = <&dart2a 0>, <&dart2b 1>;
+ };
diff --git a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
index e8d3096d922c..39b5f4c5a511 100644
--- a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
+++ b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
@@ -51,6 +51,23 @@ compatible (optional) - standard definition
used as a shared pool of DMA buffers for a set of devices. It can
be used by an operating system to instantiate the necessary pool
management subsystem if necessary.
+ - restricted-dma-pool: This indicates a region of memory meant to be
+ used as a pool of restricted DMA buffers for a set of devices. The
+ memory region would be the only region accessible to those devices.
+ When using this, the no-map and reusable properties must not be set,
+ so the operating system can create a virtual mapping that will be used
+ for synchronization. The main purpose for restricted DMA is to
+ mitigate the lack of DMA access control on systems without an IOMMU,
+ which could result in the DMA accessing the system memory at
+ unexpected times and/or unexpected addresses, possibly leading to data
+ leakage or corruption. The feature on its own provides a basic level
+ of protection against the DMA overwriting buffer contents at
+ unexpected times. However, to protect against general data leakage and
+ system memory corruption, the system needs to provide a way to lock down
+ the memory access, e.g., MPU. Note that since coherent allocation
+ needs remapping, one must set up another device coherent pool by
+ shared-dma-pool and use dma_alloc_from_dev_coherent instead for atomic
+ coherent allocation.
- vendor specific string in the form <vendor>,[<device>-]<usage>
no-map (optional) - empty property
- Indicates the operating system must not create a virtual mapping
@@ -85,10 +102,11 @@ memory-region-names (optional) - a list of names, one for each corresponding
Example
-------
-This example defines 3 contiguous regions are defined for Linux kernel:
+This example defines 4 contiguous regions for the Linux kernel:
one default of all device drivers (named linux,cma@72000000 and 64MiB in size),
-one dedicated to the framebuffer device (named framebuffer@78000000, 8MiB), and
-one for multimedia processing (named multimedia-memory@77000000, 64MiB).
+one dedicated to the framebuffer device (named framebuffer@78000000, 8MiB),
+one for multimedia processing (named multimedia-memory@77000000, 64MiB), and
+one for restricted dma pool (named restricted_dma_reserved@0x50000000, 64MiB).
/ {
#address-cells = <1>;
@@ -120,6 +138,11 @@ one for multimedia processing (named multimedia-memory@77000000, 64MiB).
compatible = "acme,multimedia-memory";
reg = <0x77000000 0x4000000>;
};
+
+ restricted_dma_reserved: restricted_dma_reserved {
+ compatible = "restricted-dma-pool";
+ reg = <0x50000000 0x4000000>;
+ };
};
/* ... */
@@ -138,4 +161,11 @@ one for multimedia processing (named multimedia-memory@77000000, 64MiB).
memory-region = <&multimedia_reserved>;
/* ... */
};
+
+ pcie_device: pcie_device@0,0 {
+ reg = <0x83010000 0x0 0x00000000 0x0 0x00100000
+ 0x83010000 0x0 0x00100000 0x0 0x00100000>;
+ memory-region = <&restricted_dma_reserved>;
+ /* ... */
+ };
};
diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
new file mode 100644
index 000000000000..4d01c7368561
--- /dev/null
+++ b/Documentation/powerpc/associativity.rst
@@ -0,0 +1,105 @@
+============================
+NUMA resource associativity
+============================
+
+Associativity represents the groupings of the various platform resources into
+domains of substantially similar mean performance relative to resources outside
+of that domain. Resource subsets of a given domain that exhibit better
+performance relative to each other than relative to other resource subsets
+are represented as being members of a sub-grouping domain. This performance
+characteristic is presented in terms of NUMA node distance within the Linux kernel.
+From the platform view, these groups are also referred to as domains.
+
+The PAPR interface currently supports different ways of communicating these resource
+grouping details to the OS. These are referred to as Form 0, Form 1 and Form 2
+associativity grouping. Form 0 is the oldest format and is now considered deprecated.
+
+The hypervisor indicates the type/form of associativity used via the "ibm,architecture-vec-5" property.
+Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
+A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity
+bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
+
+Form 0
+------
+Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
+
+Form 1
+------
+With Form 1 a combination of ibm,associativity-reference-points, and ibm,associativity
+device tree properties are used to determine the NUMA distance between resource groups/domains.
+
+The “ibm,associativity” property contains a list of one or more numbers (domainID)
+representing the resource’s platform grouping domains.
+
+The “ibm,associativity-reference-points” property contains a list of one or more numbers
+(domainID index) that represents the 1-based ordinal in the associativity lists.
+The list of domainID indexes represents an increasing hierarchy of resource grouping.
+
+ex:
+{ primary domainID index, secondary domainID index, tertiary domainID index.. }
+
+The Linux kernel uses the domainID at the primary domainID index as the NUMA node id.
+The Linux kernel computes NUMA distance between two domains by recursively comparing
+if they belong to the same higher-level domains. For mismatch at every higher
+level of the resource group, the kernel doubles the NUMA distance between the
+comparing domains.
+
+Form 2
+-------
+Form 2 associativity format adds separate device tree properties representing NUMA node distance
+thereby making the node distance computation flexible. Form 2 also allows flexible primary
+domain numbering. With numa distance computation now detached from the index value in
+"ibm,associativity-reference-points" property, Form 2 allows a large number of primary domain
+ids at the same domainID index representing resource groups of different performance/latency
+characteristics.
+
+The hypervisor indicates the usage of Form 2 associativity using bit 2 of byte 5 in the
+"ibm,architecture-vec-5" property.
+
+"ibm,numa-lookup-index-table" property contains a list of one or more numbers representing
+the domainIDs present in the system. The offset of the domainID in this property is
+used as an index while computing numa distance information via "ibm,numa-distance-table".
+
+prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
+N domainIDs encoded as with encode-int.
+
+For ex:
+"ibm,numa-lookup-index-table" = {4, 0, 8, 250, 252}. The offset of domainID 8 (2) is used when
+computing the distance of domain 8 from other domains present in the system. For the rest of
+this document, this offset will be referred to as domain distance offset.
+
+"ibm,numa-distance-table" property contains a list of one or more numbers representing the NUMA
+distance between resource groups/domains present in the system.
+
+prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
+N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
+The number N must be equal to the square of m where m is the number of domainIDs in the
+numa-lookup-index-table.
+
+For ex:
+ibm,numa-lookup-index-table = <3 0 8 40>;
+ibm,numa-distance-table = <9>, /bits/ 8 < 10 20 80 20 10 160 80 160 10>;
+
+::
+
+ | 0 8 40
+ --|------------
+ |
+ 0 | 10 20 80
+ |
+ 8 | 20 10 160
+ |
+ 40| 80 160 10
+
+A possible "ibm,associativity" property for resources in nodes 0, 8 and 40:
+
+{ 3, 6, 7, 0 }
+{ 3, 6, 9, 8 }
+{ 3, 6, 7, 40}
+
+With "ibm,associativity-reference-points" { 0x3 }
+
+"ibm,lookup-index-table" helps in having a compact representation of distance matrix.
+Since domainID can be sparse, the matrix of distances can also be effectively sparse.
+With "ibm,lookup-index-table" we can achieve a compact representation of
+distance information.
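
Putting the two Form 2 properties together, the distance between two domains is a table lookup: map each domainID to its offset in "ibm,numa-lookup-index-table", then index the flattened N x N "ibm,numa-distance-table". A standalone C sketch using the example values above (device tree parsing is omitted; the tables are hard-coded purely to show the indexing):

    #include <stdio.h>

    /* Example values from this document: domains 0, 8 and 40 with the
     * 3x3 distance matrix shown above.  The real kernel reads both tables
     * from the device tree. */
    static const unsigned int lookup_index[] = { 0, 8, 40 };
    static const unsigned char distance[] = {
        10,  20,  80,
        20,  10, 160,
        80, 160,  10,
    };
    #define NR_DOMAINS 3

    static int domain_offset(unsigned int domain_id)
    {
        for (int i = 0; i < NR_DOMAINS; i++)
            if (lookup_index[i] == domain_id)
                return i;
        return -1;
    }

    static int numa_distance(unsigned int a, unsigned int b)
    {
        int ia = domain_offset(a), ib = domain_offset(b);

        if (ia < 0 || ib < 0)
            return -1;
        return distance[ia * NR_DOMAINS + ib];
    }

    int main(void)
    {
        printf("distance(8, 40) = %d\n", numa_distance(8, 40));   /* 160 */
        printf("distance(0, 0)  = %d\n", numa_distance(0, 0));    /*  10 */
        return 0;
    }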
diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
index bf5f1a2bdbdf..0f7d3c495693 100644
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@@ -7,6 +7,7 @@ powerpc
.. toctree::
:maxdepth: 1
+ associativity
booting
bootwrapper
cpu_families
diff --git a/Documentation/translations/zh_CN/core-api/cachetlb.rst b/Documentation/translations/zh_CN/core-api/cachetlb.rst
index 8376485a534d..55827b8a7c53 100644
--- a/Documentation/translations/zh_CN/core-api/cachetlb.rst
+++ b/Documentation/translations/zh_CN/core-api/cachetlb.rst
@@ -298,15 +298,6 @@ HyperSparc cpu就是这样一个具有这种属性的cpu。
用。默认的实现是nop(对于所有相干的架构应该保持这样)。对于不一致性
的架构,它应该刷新vmaddr处的页面缓存。
- ``void flush_kernel_dcache_page(struct page *page)``
-
- 当内核需要修改一个用kmap获得的用户页时,它会在所有修改完成后(但在
- kunmapping之前)调用这个函数,以使底层页面达到最新状态。这里假定用
- 户没有不一致性的缓存副本(即原始页面是从类似get_user_pages()的机制
- 中获得的)。默认的实现是一个nop,在所有相干的架构上都应该如此。在不
- 一致性的架构上,这应该刷新内核缓存中的页面(使用page_address(page))。
-
-
``void flush_icache_range(unsigned long start, unsigned long end)``
当内核存储到它将执行的地址中时(例如在加载模块时),这个函数被调用。
diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst
index a5c884293dac..89b5f7a52077 100644
--- a/Documentation/vm/hwpoison.rst
+++ b/Documentation/vm/hwpoison.rst
@@ -180,7 +180,6 @@ Limitations
===========
- Not all page types are supported and never will. Most kernel internal
objects cannot be recovered, only LRU pages for now.
-- Right now hugepage support is missing.
---
Andi Kleen, Oct 2009
diff --git a/MAINTAINERS b/MAINTAINERS
index fb1c48c34009..fa87db67a249 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1268,6 +1268,13 @@ L: linux-input@vger.kernel.org
S: Odd fixes
F: drivers/input/mouse/bcm5974.c
+APPLE DART IOMMU DRIVER
+M: Sven Peter <sven@svenpeter.dev>
+L: iommu@lists.linux-foundation.org
+S: Maintained
+F: Documentation/devicetree/bindings/iommu/apple,dart.yaml
+F: drivers/iommu/apple-dart.c
+
APPLE SMC DRIVER
M: Henrik Rydberg <rydberg@bitmath.org>
L: linux-hwmon@vger.kernel.org
@@ -6846,7 +6853,6 @@ F: Documentation/admin-guide/media/em28xx*
F: drivers/media/usb/em28xx/
EMBEDDED LINUX
-M: Paul Gortmaker <paul.gortmaker@windriver.com>
M: Matt Mackall <mpm@selenic.com>
M: David Woodhouse <dwmw2@infradead.org>
L: linux-embedded@vger.kernel.org
@@ -11113,7 +11119,7 @@ MARDUK (CREATOR CI40) DEVICE TREE SUPPORT
M: Rahul Bedarkar <rahulbedarkar89@gmail.com>
L: linux-mips@vger.kernel.org
S: Maintained
-F: arch/mips/boot/dts/img/pistachio_marduk.dts
+F: arch/mips/boot/dts/img/pistachio*
MARVELL 88E6XXX ETHERNET SWITCH FABRIC DRIVER
M: Andrew Lunn <andrew@lunn.ch>
@@ -14817,14 +14823,6 @@ S: Maintained
W: http://www.st.com/spear
F: drivers/pinctrl/spear/
-PISTACHIO SOC SUPPORT
-M: James Hartley <james.hartley@sondrel.com>
-L: linux-mips@vger.kernel.org
-S: Odd Fixes
-F: arch/mips/boot/dts/img/pistachio*
-F: arch/mips/configs/pistachio*_defconfig
-F: arch/mips/pistachio/
-
PKTCDVD DRIVER
M: linux-block@vger.kernel.org
S: Orphan
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 7ac22e007d52..e4a041cd5715 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -486,3 +486,5 @@
554 common landlock_create_ruleset sys_landlock_create_ruleset
555 common landlock_add_rule sys_landlock_add_rule
556 common landlock_restrict_self sys_landlock_restrict_self
+# 557 reserved for memfd_secret
+558 common process_mrelease sys_process_mrelease
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 2e24e765e6d3..5e56288e343b 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -291,6 +291,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *);
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
static inline void flush_kernel_vmap_range(void *addr, int size)
{
if ((cache_is_vivt() || cache_is_vipt_aliasing()))
@@ -312,9 +313,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
__flush_anon_page(vma, page, vmaddr);
}
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-extern void flush_kernel_dcache_page(struct page *);
-
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index f97eb2371672..284a80c0b6e1 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -1012,31 +1012,25 @@ static void __init reserve_crashkernel(void)
unsigned long long lowmem_max = __pa(high_memory - 1) + 1;
if (crash_max > lowmem_max)
crash_max = lowmem_max;
- crash_base = memblock_find_in_range(CRASH_ALIGN, crash_max,
- crash_size, CRASH_ALIGN);
+
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ CRASH_ALIGN, crash_max);
if (!crash_base) {
pr_err("crashkernel reservation failed - No suitable area found.\n");
return;
}
} else {
+ unsigned long long crash_max = crash_base + crash_size;
unsigned long long start;
- start = memblock_find_in_range(crash_base,
- crash_base + crash_size,
- crash_size, SECTION_SIZE);
- if (start != crash_base) {
+ start = memblock_phys_alloc_range(crash_size, SECTION_SIZE,
+ crash_base, crash_max);
+ if (!start) {
pr_err("crashkernel reservation failed - memory is in use.\n");
return;
}
}
- ret = memblock_reserve(crash_base, crash_size);
- if (ret < 0) {
- pr_warn("crashkernel reservation failed - memory is in use (0x%lx)\n",
- (unsigned long)crash_base);
- return;
- }
-
pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
(unsigned long)(crash_size >> 20),
(unsigned long)(crash_base >> 20),
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 6d89db7895d1..7ff9feea13a6 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -346,39 +346,6 @@ void flush_dcache_page(struct page *page)
EXPORT_SYMBOL(flush_dcache_page);
/*
- * Ensure cache coherency for the kernel mapping of this page. We can
- * assume that the page is pinned via kmap.
- *
- * If the page only exists in the page cache and there are no user
- * space mappings, this is a no-op since the page was already marked
- * dirty at creation. Otherwise, we need to flush the dirty kernel
- * cache lines directly.
- */
-void flush_kernel_dcache_page(struct page *page)
-{
- if (cache_is_vivt() || cache_is_vipt_aliasing()) {
- struct address_space *mapping;
-
- mapping = page_mapping_file(page);
-
- if (!mapping || mapping_mapped(mapping)) {
- void *addr;
-
- addr = page_address(page);
- /*
- * kmap_atomic() doesn't set the page virtual
- * address for highmem pages, and
- * kunmap_atomic() takes care of cache
- * flushing already.
- */
- if (!IS_ENABLED(CONFIG_HIGHMEM) || addr)
- __cpuc_flush_dcache_area(addr, PAGE_SIZE);
- }
- }
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
-/*
* Flush an anonymous page so that users of get_user_pages()
* can safely access the data. The expected sequence is:
*
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index 8b3d7191e2b8..2658f52903da 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -166,12 +166,6 @@ void flush_dcache_page(struct page *page)
}
EXPORT_SYMBOL(flush_dcache_page);
-void flush_kernel_dcache_page(struct page *page)
-{
- __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long uaddr, void *dst, const void *src,
unsigned long len)
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index f8a2d5aa17b7..7e0a9b692d87 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -460,3 +460,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 727bfc3be99b..3cb206aea3db 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 447
+#define __NR_compat_syscalls 449
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 03d4ca47d253..4e99e4b912ef 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -901,6 +901,8 @@ __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset)
__SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
#define __NR_landlock_restrict_self 446
__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
+#define __NR_process_mrelease 448
+__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c
index d654921dd09b..578670e3f608 100644
--- a/arch/arm64/kvm/hyp/reserved_mem.c
+++ b/arch/arm64/kvm/hyp/reserved_mem.c
@@ -92,12 +92,10 @@ void __init kvm_hyp_reserve(void)
* this is unmapped from the host stage-2, and fallback to PAGE_SIZE.
*/
hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
- hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
- ALIGN(hyp_mem_size, PMD_SIZE),
- PMD_SIZE);
+ hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE),
+ PMD_SIZE);
if (!hyp_mem_base)
- hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
- hyp_mem_size, PAGE_SIZE);
+ hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE);
else
hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
@@ -105,7 +103,6 @@ void __init kvm_hyp_reserve(void)
kvm_err("Failed to reserve hyp memory\n");
return;
}
- memblock_reserve(hyp_mem_base, hyp_mem_size);
kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
hyp_mem_base);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index edc8e950bada..b16be52233c6 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -74,6 +74,7 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
static void __init reserve_crashkernel(void)
{
unsigned long long crash_base, crash_size;
+ unsigned long long crash_max = arm64_dma_phys_limit;
int ret;
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
@@ -84,33 +85,18 @@ static void __init reserve_crashkernel(void)
crash_size = PAGE_ALIGN(crash_size);
- if (crash_base == 0) {
- /* Current arm64 boot protocol requires 2MB alignment */
- crash_base = memblock_find_in_range(0, arm64_dma_phys_limit,
- crash_size, SZ_2M);
- if (crash_base == 0) {
- pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
- crash_size);
- return;
- }
- } else {
- /* User specifies base address explicitly. */
- if (!memblock_is_region_memory(crash_base, crash_size)) {
- pr_warn("cannot reserve crashkernel: region is not memory\n");
- return;
- }
+ /* User specifies base address explicitly. */
+ if (crash_base)
+ crash_max = crash_base + crash_size;
- if (memblock_is_region_reserved(crash_base, crash_size)) {
- pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n");
- return;
- }
-
- if (!IS_ALIGNED(crash_base, SZ_2M)) {
- pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n");
- return;
- }
+ /* Current arm64 boot protocol requires 2MB alignment */
+ crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
+ crash_base, crash_max);
+ if (!crash_base) {
+ pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+ crash_size);
+ return;
}
- memblock_reserve(crash_base, crash_size);
pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
crash_base, crash_base + crash_size, crash_size >> 20);
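Here the two cases (kernel-chosen base versus an explicit crashkernel=size@base) collapse into one bounded allocation: when the user pins the base, the search window shrinks to exactly [crash_base, crash_base + crash_size), and memblock_phys_alloc_range() performs the memory/overlap/alignment checks that the removed open-coded branches used to do by hand. A condensed sketch of that placement logic, with hypothetical parameter names:

/* Illustrative sketch, not part of the patch. */
static phys_addr_t __init place_region(phys_addr_t size, phys_addr_t base,
				       phys_addr_t limit)
{
	if (base)			/* fixed base from the command line */
		limit = base + size;

	/* Returns the reserved physical base, or 0 on failure;
	 * SZ_2M alignment follows the arm64 boot protocol.
	 */
	return memblock_phys_alloc_range(size, SZ_2M, base, limit);
}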
diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c
index 07ff17ea33de..fb91b069dc69 100644
--- a/arch/csky/abiv1/cacheflush.c
+++ b/arch/csky/abiv1/cacheflush.c
@@ -56,17 +56,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr,
}
}
-void flush_kernel_dcache_page(struct page *page)
-{
- struct address_space *mapping;
-
- mapping = page_mapping_file(page);
-
- if (!mapping || mapping_mapped(mapping))
- dcache_wbinv_all();
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h
index 6cab7afae962..ed62e2066ba7 100644
--- a/arch/csky/abiv1/inc/abi/cacheflush.h
+++ b/arch/csky/abiv1/inc/abi/cacheflush.h
@@ -14,12 +14,10 @@ extern void flush_dcache_page(struct page *);
#define flush_cache_page(vma, page, pfn) cache_wbinv_all()
#define flush_cache_dup_mm(mm) cache_wbinv_all()
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-extern void flush_kernel_dcache_page(struct page *);
-
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
static inline void flush_kernel_vmap_range(void *addr, int size)
{
dcache_wbinv_all();
diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c
index 68b22b499aeb..8fffa34d4e1c 100644
--- a/arch/csky/kernel/probes/kprobes.c
+++ b/arch/csky/kernel/probes/kprobes.c
@@ -283,8 +283,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr)
* normal page fault.
*/
regs->pc = (unsigned long) cur->addr;
- if (!instruction_pointer(regs))
- BUG();
+ BUG_ON(!instruction_pointer(regs));
if (kcb->kprobe_status == KPROBE_REENTER)
restore_previous_kprobe(kcb);
diff --git a/arch/ia64/include/asm/meminit.h b/arch/ia64/include/asm/meminit.h
index 6c47a239fc26..f1d5bf2ba847 100644
--- a/arch/ia64/include/asm/meminit.h
+++ b/arch/ia64/include/asm/meminit.h
@@ -29,7 +29,6 @@ struct rsvd_region {
};
extern struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1];
-extern int num_rsvd_regions;
extern void find_memory (void);
extern void reserve_memory (void);
@@ -40,7 +39,6 @@ extern unsigned long efi_memmap_init(u64 *s, u64 *e);
extern int find_max_min_low_pfn (u64, u64, void *);
extern unsigned long vmcore_find_descriptor_size(unsigned long address);
-extern int reserve_elfcorehdr(u64 *start, u64 *end);
/*
* For rounding an address to the next IA64_GRANULE_SIZE or order
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index e2af6b172200..96d13cb7c19f 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -906,6 +906,6 @@ EXPORT_SYMBOL(acpi_unregister_ioapic);
/*
* acpi_suspend_lowlevel() - save kernel state and suspend.
*
- * TBD when when IA64 starts to support suspend...
+ * TBD when IA64 starts to support suspend...
*/
int acpi_suspend_lowlevel(void) { return 0; }
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index dd595fbd8006..31fb84de2d21 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -131,7 +131,7 @@ unsigned long ia64_cache_stride_shift = ~0;
* We use a special marker for the end of memory and it uses the extra (+1) slot
*/
struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata;
-int num_rsvd_regions __initdata;
+static int num_rsvd_regions __initdata;
/*
@@ -325,6 +325,31 @@ static inline void __init setup_crashkernel(unsigned long total, int *n)
{}
#endif
+#ifdef CONFIG_CRASH_DUMP
+static int __init reserve_elfcorehdr(u64 *start, u64 *end)
+{
+ u64 length;
+
+ /* We get the address using the kernel command line,
+ * but the size is extracted from the EFI tables.
+ * Both address and size are required for reservation
+ * to work properly.
+ */
+
+ if (!is_vmcore_usable())
+ return -EINVAL;
+
+ if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) {
+ vmcore_unusable();
+ return -EINVAL;
+ }
+
+ *start = (unsigned long)__va(elfcorehdr_addr);
+ *end = *start + length;
+ return 0;
+}
+#endif /* CONFIG_CRASH_DUMP */
+
/**
* reserve_memory - setup reserved memory areas
*
@@ -522,32 +547,6 @@ static __init int setup_nomca(char *s)
}
early_param("nomca", setup_nomca);
-#ifdef CONFIG_CRASH_DUMP
-int __init reserve_elfcorehdr(u64 *start, u64 *end)
-{
- u64 length;
-
- /* We get the address using the kernel command line,
- * but the size is extracted from the EFI tables.
- * Both address and size are required for reservation
- * to work properly.
- */
-
- if (!is_vmcore_usable())
- return -EINVAL;
-
- if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) {
- vmcore_unusable();
- return -EINVAL;
- }
-
- *start = (unsigned long)__va(elfcorehdr_addr);
- *end = *start + length;
- return 0;
-}
-
-#endif /* CONFIG_PROC_VMCORE */
-
void __init
setup_arch (char **cmdline_p)
{
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 4b20224b14d9..6fea1844fb95 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -367,3 +367,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 3ec1291c268d..7976dff8f879 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -446,3 +446,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease
diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h
index ce550978f4fc..4b8b2fa78fc5 100644
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -112,8 +112,7 @@ extern int page_is_ram(unsigned long pfn);
# define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
# define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT)
-# define pfn_valid(pfn) ((pfn) < (max_mapnr + ARCH_PFN_OFFSET))
-
+# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET))
# endif /* __ASSEMBLY__ */
#define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr)))
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 71cd547655d9..c136a01e467e 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -443,8 +443,6 @@ extern int mem_init_done;
asmlinkage void __init mmu_init(void);
-void __init *early_get_page(void);
-
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 9be3ace12938..6b0e11362bd2 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -452,3 +452,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index ab55c70380a5..952f35b335b2 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -265,18 +265,6 @@ asmlinkage void __init mmu_init(void)
dma_contiguous_reserve(memory_start + lowmem_size - 1);
}
-/* This is only called until mem_init is done. */
-void __init *early_get_page(void)
-{
- /*
- * Mem start + kernel_tlb -> here is limit
- * because of mem mapping from head.S
- */
- return memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE,
- MEMBLOCK_LOW_LIMIT, memory_start + kernel_tlb,
- NUMA_NO_NODE);
-}
-
void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask)
{
void *p;
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index 38ccb909bc9d..c1833b159d3b 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -33,6 +33,7 @@
#include <linux/init.h>
#include <linux/mm_types.h>
#include <linux/pgtable.h>
+#include <linux/memblock.h>
#include <asm/pgalloc.h>
#include <linux/io.h>
@@ -242,15 +243,13 @@ unsigned long iopa(unsigned long addr)
__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
- pte_t *pte;
- if (mem_init_done) {
- pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- } else {
- pte = (pte_t *)early_get_page();
- if (pte)
- clear_page(pte);
- }
- return pte;
+ if (mem_init_done)
+ return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ else
+ return memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE,
+ MEMBLOCK_LOW_LIMIT,
+ memory_start + kernel_tlb,
+ NUMA_NO_NODE);
}
void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
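Folding early_get_page() into pte_alloc_one_kernel() also lets the clear_page() call go: the removed helper used memblock_alloc_try_nid_raw(), which hands back uninitialized memory, while the replacement memblock_alloc_try_nid() zeroes the allocation itself. A small sketch of that distinction, with a hypothetical early_limit standing in for memory_start + kernel_tlb:

/* Illustrative sketch, not part of the patch. */
#include <linux/memblock.h>
#include <linux/string.h>

static void __init early_alloc_demo(phys_addr_t early_limit)
{
	void *zeroed, *raw;

	/* The non-_raw allocator returns zeroed memory. */
	zeroed = memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE,
					MEMBLOCK_LOW_LIMIT, early_limit,
					NUMA_NO_NODE);

	/* The _raw variant does not, so its caller clears the page,
	 * as the removed early_get_page() user did with clear_page().
	 */
	raw = memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE,
					 MEMBLOCK_LOW_LIMIT, early_limit,
					 NUMA_NO_NODE);
	if (raw)
		memset(raw, 0, PAGE_SIZE);
}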
diff --git a/arch/mips/Kbuild.platforms b/arch/mips/Kbuild.platforms
index e4f6e49417a9..584081df89c2 100644
--- a/arch/mips/Kbuild.platforms
+++ b/arch/mips/Kbuild.platforms
@@ -21,7 +21,6 @@ platform-$(CONFIG_MIPS_MALTA) += mti-malta/
platform-$(CONFIG_MACH_NINTENDO64) += n64/
platform-$(CONFIG_NLM_COMMON) += netlogic/
platform-$(CONFIG_PIC32MZDA) += pic32/
-platform-$(CONFIG_MACH_PISTACHIO) += pistachio/
platform-$(CONFIG_RALINK) += ralink/
platform-$(CONFIG_MIKROTIK_RB532) += rb532/
platform-$(CONFIG_SGI_IP22) += sgi-ip22/
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 24e374266fdc..9b8ff6c2c1e3 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -514,35 +514,6 @@ config MACH_LOONGSON64
and Loongson-2F which will be removed), developed by the Institute
of Computing Technology (ICT), Chinese Academy of Sciences (CAS).
-config MACH_PISTACHIO
- bool "IMG Pistachio SoC based boards"
- select BOOT_ELF32
- select BOOT_RAW
- select CEVT_R4K
- select CLKSRC_MIPS_GIC
- select COMMON_CLK
- select CSRC_R4K
- select DMA_NONCOHERENT
- select GPIOLIB
- select IRQ_MIPS_CPU
- select MFD_SYSCON
- select MIPS_CPU_SCACHE
- select MIPS_GIC
- select PINCTRL
- select REGULATOR
- select SYS_HAS_CPU_MIPS32_R2
- select SYS_SUPPORTS_32BIT_KERNEL
- select SYS_SUPPORTS_LITTLE_ENDIAN
- select SYS_SUPPORTS_MIPS_CPS
- select SYS_SUPPORTS_MULTITHREADING
- select SYS_SUPPORTS_RELOCATABLE
- select SYS_SUPPORTS_ZBOOT
- select SYS_HAS_EARLY_PRINTK
- select USE_GENERIC_EARLY_PRINTK_8250
- select USE_OF
- help
- This enables support for the IMG Pistachio SoC platform.
-
config MIPS_MALTA
bool "MIPS Malta board"
select ARCH_MAY_HAVE_PC_FDC
@@ -1089,7 +1060,6 @@ source "arch/mips/ingenic/Kconfig"
source "arch/mips/jazz/Kconfig"
source "arch/mips/lantiq/Kconfig"
source "arch/mips/pic32/Kconfig"
-source "arch/mips/pistachio/Kconfig"
source "arch/mips/ralink/Kconfig"
source "arch/mips/sgi-ip27/Kconfig"
source "arch/mips/sibyte/Kconfig"
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index 653befc1b176..1fb97579b6b4 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -560,6 +560,9 @@ sead3micro_defconfig-y := micro32r2el_defconfig BOARDS=sead-3
legacy_defconfigs += xilfpga_defconfig
xilfpga_defconfig-y := 32r2el_defconfig BOARDS=xilfpga
+legacy_defconfigs += pistachio_defconfig
+pistachio_defconfig-y := 32r2el_defconfig BOARDS=marduk
+
.PHONY: $(legacy_defconfigs)
$(legacy_defconfigs):
$(Q)$(MAKE) -f $(srctree)/Makefile $($@-y)
diff --git a/arch/mips/alchemy/devboards/db1200.c b/arch/mips/alchemy/devboards/db1200.c
index 421d651433b6..1864eb935ca5 100644
--- a/arch/mips/alchemy/devboards/db1200.c
+++ b/arch/mips/alchemy/devboards/db1200.c
@@ -835,7 +835,7 @@ int __init db1200_dev_setup(void)
if (!IS_ERR(c)) {
pfc = clk_round_rate(c, 50000000);
if ((pfc < 1) || (abs(50000000 - pfc) > 2500000))
- pr_warn("DB1200: cant get I2C close to 50MHz\n");
+ pr_warn("DB1200: can't get I2C close to 50MHz\n");
else
clk_set_rate(c, pfc);
clk_prepare_enable(c);
diff --git a/arch/mips/boot/dts/Makefile b/arch/mips/boot/dts/Makefile
index 60bd7d2a9ad8..be96d35eb582 100644
--- a/arch/mips/boot/dts/Makefile
+++ b/arch/mips/boot/dts/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
subdir-$(CONFIG_BMIPS_GENERIC) += brcm
subdir-$(CONFIG_CAVIUM_OCTEON_SOC) += cavium-octeon
-subdir-$(CONFIG_MACH_PISTACHIO) += img
+subdir-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += img
subdir-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += img
subdir-$(CONFIG_MACH_INGENIC) += ingenic
subdir-$(CONFIG_LANTIQ) += lantiq
diff --git a/arch/mips/boot/dts/img/Makefile b/arch/mips/boot/dts/img/Makefile
index 441a3c16efb0..ebb47490b04b 100644
--- a/arch/mips/boot/dts/img/Makefile
+++ b/arch/mips/boot/dts/img/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
dtb-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += boston.dtb
-dtb-$(CONFIG_MACH_PISTACHIO) += pistachio_marduk.dtb
-obj-$(CONFIG_MACH_PISTACHIO) += pistachio_marduk.dtb.o
+dtb-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += pistachio_marduk.dtb
diff --git a/arch/mips/boot/dts/img/pistachio.dtsi b/arch/mips/boot/dts/img/pistachio.dtsi
index dc3b7909de73..b1db8b8f446f 100644
--- a/arch/mips/boot/dts/img/pistachio.dtsi
+++ b/arch/mips/boot/dts/img/pistachio.dtsi
@@ -900,6 +900,16 @@
};
};
+ cpc: cpc@1bde0000 {
+ compatible = "mti,mips-cpc";
+ reg = <0x1bde0000 0x10000>;
+ };
+
+ cdmm: cdmm@1bdf0000 {
+ compatible = "mti,mips-cdmm";
+ reg = <0x1bdf0000 0x10000>;
+ };
+
usb_phy: usb-phy {
compatible = "img,pistachio-usb-phy";
clocks = <&clk_core CLK_USB_PHY>;
diff --git a/arch/mips/boot/dts/mscc/ocelot.dtsi b/arch/mips/boot/dts/mscc/ocelot.dtsi
index 535a98284dcb..e51db651af13 100644
--- a/arch/mips/boot/dts/mscc/ocelot.dtsi
+++ b/arch/mips/boot/dts/mscc/ocelot.dtsi
@@ -150,36 +150,47 @@
port0: port@0 {
reg = <0>;
+ status = "disabled";
};
port1: port@1 {
reg = <1>;
+ status = "disabled";
};
port2: port@2 {
reg = <2>;
+ status = "disabled";
};
port3: port@3 {
reg = <3>;
+ status = "disabled";
};
port4: port@4 {
reg = <4>;
+ status = "disabled";
};
port5: port@5 {
reg = <5>;
+ status = "disabled";
};
port6: port@6 {
reg = <6>;
+ status = "disabled";
};
port7: port@7 {
reg = <7>;
+ status = "disabled";
};
port8: port@8 {
reg = <8>;
+ status = "disabled";
};
port9: port@9 {
reg = <9>;
+ status = "disabled";
};
port10: port@10 {
reg = <10>;
+ status = "disabled";
};
};
};
diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts
index 897de5025d7f..bd240690cb37 100644
--- a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts
+++ b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts
@@ -69,40 +69,52 @@
};
&port0 {
+ status = "okay";
phy-handle = <&phy0>;
+ phy-mode = "internal";
};
&port1 {
+ status = "okay";
phy-handle = <&phy1>;
+ phy-mode = "internal";
};
&port2 {
+ status = "okay";
phy-handle = <&phy2>;
+ phy-mode = "internal";
};
&port3 {
+ status = "okay";
phy-handle = <&phy3>;
+ phy-mode = "internal";
};
&port4 {
+ status = "okay";
phy-handle = <&phy7>;
phy-mode = "sgmii";
phys = <&serdes 4 SERDES1G(2)>;
};
&port5 {
+ status = "okay";
phy-handle = <&phy4>;
phy-mode = "sgmii";
phys = <&serdes 5 SERDES1G(5)>;
};
&port6 {
+ status = "okay";
phy-handle = <&phy6>;
phy-mode = "sgmii";
phys = <&serdes 6 SERDES1G(3)>;
};
&port9 {
+ status = "okay";
phy-handle = <&phy5>;
phy-mode = "sgmii";
phys = <&serdes 9 SERDES1G(4)>;
diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb123.dts b/arch/mips/boot/dts/mscc/ocelot_pcb123.dts
index ef852f382da8..0185045c7630 100644
--- a/arch/mips/boot/dts/mscc/ocelot_pcb123.dts
+++ b/arch/mips/boot/dts/mscc/ocelot_pcb123.dts
@@ -47,17 +47,25 @@
};
&port0 {
+ status = "okay";
phy-handle = <&phy0>;
+ phy-mode = "internal";
};
&port1 {
+ status = "okay";
phy-handle = <&phy1>;
+ phy-mode = "internal";
};
&port2 {
+ status = "okay";
phy-handle = <&phy2>;
+ phy-mode = "internal";
};
&port3 {
+ status = "okay";
phy-handle = <&phy3>;
+ phy-mode = "internal";
};
diff --git a/arch/mips/cavium-octeon/executive/cvmx-bootmem.c b/arch/mips/cavium-octeon/executive/cvmx-bootmem.c
index e794b2d53adf..b63ad5d42cc7 100644
--- a/arch/mips/cavium-octeon/executive/cvmx-bootmem.c
+++ b/arch/mips/cavium-octeon/executive/cvmx-bootmem.c
@@ -44,7 +44,7 @@ static struct cvmx_bootmem_desc *cvmx_bootmem_desc;
/* See header file for descriptions of functions */
-/**
+/*
* This macro returns a member of the
* cvmx_bootmem_named_block_desc_t structure. These members can't
* be directly addressed as they might be in memory not directly
@@ -60,7 +60,7 @@ static struct cvmx_bootmem_desc *cvmx_bootmem_desc;
offsetof(struct cvmx_bootmem_named_block_desc, field), \
sizeof_field(struct cvmx_bootmem_named_block_desc, field))
-/**
+/*
* This function is the implementation of the get macros defined
* for individual structure members. The argument are generated
* by the macros inorder to read only the needed memory.
@@ -115,7 +115,7 @@ static uint64_t cvmx_bootmem_phy_get_next(uint64_t addr)
return cvmx_read64_uint64((addr + NEXT_OFFSET) | (1ull << 63));
}
-/**
+/*
* Allocate a block of memory from the free list that was
* passed to the application by the bootloader within a specified
* address range. This is an allocate-only algorithm, so
@@ -550,7 +550,7 @@ bootmem_free_done:
}
-/**
+/*
* Finds a named memory block by name.
* Also used for finding an unused entry in the named block table.
*
@@ -657,7 +657,7 @@ struct cvmx_bootmem_named_block_desc *cvmx_bootmem_find_named_block(char *name)
}
EXPORT_SYMBOL(cvmx_bootmem_find_named_block);
-/**
+/*
* Frees a named block.
*
* @name: name of block to free
diff --git a/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c b/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c
index 3839feba68f2..20189e9ad94d 100644
--- a/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c
+++ b/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c
@@ -42,14 +42,14 @@
#include <asm/octeon/cvmx-pexp-defs.h>
#include <asm/octeon/cvmx-pko-defs.h>
-/**
+/*
* This application uses this pointer to access the global queue
* state. It points to a bootmem named block.
*/
__cvmx_cmd_queue_all_state_t *__cvmx_cmd_queue_state_ptr;
EXPORT_SYMBOL_GPL(__cvmx_cmd_queue_state_ptr);
-/**
+/*
* Initialize the Global queue state pointer.
*
* Returns CVMX_CMD_QUEUE_SUCCESS or a failure code
@@ -57,27 +57,14 @@ EXPORT_SYMBOL_GPL(__cvmx_cmd_queue_state_ptr);
static cvmx_cmd_queue_result_t __cvmx_cmd_queue_init_state_ptr(void)
{
char *alloc_name = "cvmx_cmd_queues";
-#if defined(CONFIG_CAVIUM_RESERVE32) && CONFIG_CAVIUM_RESERVE32
- extern uint64_t octeon_reserve32_memory;
-#endif
if (likely(__cvmx_cmd_queue_state_ptr))
return CVMX_CMD_QUEUE_SUCCESS;
-#if defined(CONFIG_CAVIUM_RESERVE32) && CONFIG_CAVIUM_RESERVE32
- if (octeon_reserve32_memory)
- __cvmx_cmd_queue_state_ptr =
- cvmx_bootmem_alloc_named_range(sizeof(*__cvmx_cmd_queue_state_ptr),
- octeon_reserve32_memory,
- octeon_reserve32_memory +
- (CONFIG_CAVIUM_RESERVE32 <<
- 20) - 1, 128, alloc_name);
- else
-#endif
- __cvmx_cmd_queue_state_ptr =
- cvmx_bootmem_alloc_named(sizeof(*__cvmx_cmd_queue_state_ptr),
- 128,
- alloc_name);
+ __cvmx_cmd_queue_state_ptr =
+ cvmx_bootmem_alloc_named(sizeof(*__cvmx_cmd_queue_state_ptr),
+ 128,
+ alloc_name);
if (__cvmx_cmd_queue_state_ptr)
memset(__cvmx_cmd_queue_state_ptr, 0,
sizeof(*__cvmx_cmd_queue_state_ptr));
@@ -97,7 +84,7 @@ static cvmx_cmd_queue_result_t __cvmx_cmd_queue_init_state_ptr(void)
return CVMX_CMD_QUEUE_SUCCESS;
}
-/**
+/*
* Initialize a command queue for use. The initial FPA buffer is
* allocated and the hardware unit is configured to point to the
* new command queue.
@@ -195,7 +182,7 @@ cvmx_cmd_queue_result_t cvmx_cmd_queue_initialize(cvmx_cmd_queue_id_t queue_id,
}
}
-/**
+/*
* Shutdown a queue a free it's command buffers to the FPA. The
* hardware connected to the queue must be stopped before this
* function is called.
@@ -231,7 +218,7 @@ cvmx_cmd_queue_result_t cvmx_cmd_queue_shutdown(cvmx_cmd_queue_id_t queue_id)
return CVMX_CMD_QUEUE_SUCCESS;
}
-/**
+/*
* Return the number of command words pending in the queue. This
* function may be relatively slow for some hardware units.
*
@@ -287,7 +274,7 @@ int cvmx_cmd_queue_length(cvmx_cmd_queue_id_t queue_id)
return CVMX_CMD_QUEUE_INVALID_PARAM;
}
-/**
+/*
* Return the command buffer to be written to. The purpose of this
* function is to allow CVMX routine access t othe low level buffer
* for initial hardware setup. User applications should not call this
diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-board.c b/arch/mips/cavium-octeon/executive/cvmx-helper-board.c
index abd11b7af22f..1daa0c6b6f4e 100644
--- a/arch/mips/cavium-octeon/executive/cvmx-helper-board.c
+++ b/arch/mips/cavium-octeon/executive/cvmx-helper-board.c
@@ -44,7 +44,7 @@
#include <asm/octeon/cvmx-gmxx-defs.h>
#include <asm/octeon/cvmx-asxx-defs.h>
-/**
+/*
* Return the MII PHY address associated with the given IPD
* port. A result of -1 means there isn't a MII capable PHY
* connected to this port. On chips supporting multiple MII
@@ -189,7 +189,7 @@ int cvmx_helper_board_get_mii_address(int ipd_port)
return -1;
}
-/**
+/*
* This function is the board specific method of determining an
* ethernet ports link speed. Most Octeon boards have Marvell PHYs
* and are handled by the fall through case. This function must be
@@ -274,7 +274,7 @@ union cvmx_helper_link_info __cvmx_helper_board_link_get(int ipd_port)
return result;
}
-/**
+/*
* This function is called by cvmx_helper_interface_probe() after it
* determines the number of ports Octeon can support on a specific
* interface. This function is the per board location to override
@@ -320,7 +320,7 @@ int __cvmx_helper_board_interface_probe(int interface, int supported_ports)
return supported_ports;
}
-/**
+/*
* Get the clock type used for the USB block based on board type.
* Used by the USB code for auto configuration of clock type.
*
diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c b/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c
index c4b58598aa9d..a8c3be4eb6f0 100644
--- a/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c
+++ b/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c
@@ -42,7 +42,7 @@
#include <asm/octeon/cvmx-asxx-defs.h>
#include <asm/octeon/cvmx-dbg-defs.h>
-/**
+/*
* Probe RGMII ports and determine the number present
*
* @interface: Interface to probe
@@ -88,7 +88,7 @@ int __cvmx_helper_rgmii_probe(int interface)
return num_ports;
}
-/**
+/*
* Put an RGMII interface in loopback mode. Internal packets sent
* out will be received back again on the same port. Externally
* received packets will echo back out.
@@ -120,7 +120,7 @@ void cvmx_helper_rgmii_internal_loopback(int port)
cvmx_write_csr(CVMX_GMXX_PRTX_CFG(index, interface), gmx_cfg.u64);
}
-/**
+/*
* Workaround ASX setup errata with CN38XX pass1
*
* @interface: Interface to setup
@@ -148,7 +148,7 @@ static int __cvmx_helper_errata_asx_pass1(int interface, int port,
return 0;
}
-/**
+/*
* Configure all of the ASX, GMX, and PKO registers required
* to get RGMII to function on the supplied interface.
*
@@ -251,7 +251,7 @@ int __cvmx_helper_rgmii_enable(int interface)
return 0;
}
-/**
+/*
* Return the link state of an IPD/PKO port as returned by
* auto negotiation. The result of this function may not match
* Octeon's link config if auto negotiation has changed since
@@ -280,7 +280,7 @@ union cvmx_helper_link_info __cvmx_helper_rgmii_link_get(int ipd_port)
return __cvmx_helper_board_link_get(ipd_port);
}
-/**
+/*
* Configure an IPD/PKO port for the specified link state. This
* function does not influence auto negotiation at the PHY level.
* The passed link state must always match the link state returned
diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c b/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c
index 842990e8404f..fea71a85bb29 100644
--- a/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c
+++ b/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c
@@ -54,7 +54,7 @@ int __cvmx_helper_xaui_enumerate(int interface)
return 1;
}
-/**
+/*
* Probe a XAUI interface and determine the number of ports
* connected to it. The XAUI interface should still be down
* after this call.
@@ -102,7 +102,7 @@ int __cvmx_helper_xaui_probe(int interface)
return __cvmx_helper_xaui_enumerate(interface);
}
-/**
+/*
* Bringup and enable a XAUI interface. After this call packet
* I/O should be fully functional. This is called with IPD
* enabled but PKO disabled.
@@ -249,7 +249,7 @@ int __cvmx_helper_xaui_enable(int interface)
return 0;
}
-/**
+/*
* Return the link state of an IPD/PKO port as returned by
* auto negotiation. The result of this function may not match
* Octeon's link config if auto negotiation has changed since
@@ -288,7 +288,7 @@ union cvmx_helper_link_info __cvmx_helper_xaui_link_get(int ipd_port)
return result;
}
-/**
+/*
* Configure an IPD/PKO port for the specified link state. This
* function does not influence auto negotiation at the PHY level.
* The passed link state must always match the link state returned
diff --git a/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c b/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c
index 2f415d9d0f3c..67d6da21d49f 100644
--- a/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c
+++ b/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c
@@ -46,7 +46,9 @@
/**
- * __cvmx_interrupt_gmxx_rxx_int_en_enable enables all interrupt bits in cvmx_gmxx_rxx_int_en_t
+ * __cvmx_interrupt_gmxx_rxx_int_en_enable - enable all interrupt bits in cvmx_gmxx_rxx_int_en_t
+ * @index: interrupt register offset
+ * @block: interrupt register block_id
*/
void __cvmx_interrupt_gmxx_rxx_int_en_enable(int index, int block)
{
@@ -227,7 +229,9 @@ void __cvmx_interrupt_gmxx_rxx_int_en_enable(int index, int block)
cvmx_write_csr(CVMX_GMXX_RXX_INT_EN(index, block), gmx_rx_int_en.u64);
}
/**
- * __cvmx_interrupt_pcsx_intx_en_reg_enable enables all interrupt bits in cvmx_pcsx_intx_en_reg_t
+ * __cvmx_interrupt_pcsx_intx_en_reg_enable - enable all interrupt bits in cvmx_pcsx_intx_en_reg_t
+ * @index: interrupt register offset
+ * @block: interrupt register block_id
*/
void __cvmx_interrupt_pcsx_intx_en_reg_enable(int index, int block)
{
@@ -268,7 +272,8 @@ void __cvmx_interrupt_pcsx_intx_en_reg_enable(int index, int block)
cvmx_write_csr(CVMX_PCSX_INTX_EN_REG(index, block), pcs_int_en_reg.u64);
}
/**
- * __cvmx_interrupt_pcsxx_int_en_reg_enable enables all interrupt bits in cvmx_pcsxx_int_en_reg_t
+ * __cvmx_interrupt_pcsxx_int_en_reg_enable - enable all interrupt bits in cvmx_pcsxx_int_en_reg_t
+ * @index: interrupt register block_id
*/
void __cvmx_interrupt_pcsxx_int_en_reg_enable(int index)
{
@@ -298,7 +303,8 @@ void __cvmx_interrupt_pcsxx_int_en_reg_enable(int index)
}
/**
- * __cvmx_interrupt_spxx_int_msk_enable enables all interrupt bits in cvmx_spxx_int_msk_t
+ * __cvmx_interrupt_spxx_int_msk_enable - enable all interrupt bits in cvmx_spxx_int_msk_t
+ * @index: interrupt register block_id
*/
void __cvmx_interrupt_spxx_int_msk_enable(int index)
{
@@ -337,7 +343,8 @@ void __cvmx_interrupt_spxx_int_msk_enable(int index)
cvmx_write_csr(CVMX_SPXX_INT_MSK(index), spx_int_msk.u64);
}
/**
- * __cvmx_interrupt_stxx_int_msk_enable enables all interrupt bits in cvmx_stxx_int_msk_t
+ * __cvmx_interrupt_stxx_int_msk_enable - enable all interrupt bits in cvmx_stxx_int_msk_t
+ * @index: interrupt register block_id
*/
void __cvmx_interrupt_stxx_int_msk_enable(int index)
{
diff --git a/arch/mips/cavium-octeon/executive/cvmx-l2c.c b/arch/mips/cavium-octeon/executive/cvmx-l2c.c
index 83df0a963a8b..33b303691bc2 100644
--- a/arch/mips/cavium-octeon/executive/cvmx-l2c.c
+++ b/arch/mips/cavium-octeon/executive/cvmx-l2c.c
@@ -281,7 +281,7 @@ uint64_t cvmx_l2c_read_perf(uint32_t counter)
}
}
-/**
+/*
* @INTERNAL
* Helper function use to fault in cache lines for L2 cache locking
*
@@ -575,7 +575,7 @@ union __cvmx_l2c_tag {
};
-/**
+/*
* @INTERNAL
* Function to read a L2C tag. This code make the current core
* the 'debug core' for the L2. This code must only be executed by
@@ -764,9 +764,8 @@ int cvmx_l2c_get_cache_size_bytes(void)
CVMX_CACHE_LINE_SIZE;
}
-/**
+/*
* Return log base 2 of the number of sets in the L2 cache
- * Returns
*/
int cvmx_l2c_get_set_bits(void)
{
@@ -857,7 +856,7 @@ int cvmx_l2c_get_num_assoc(void)
return l2_assoc;
}
-/**
+/*
* Flush a line from the L2 cache
* This should only be called from one core at a time, as this routine
* sets the core to the 'debug' core in order to flush the line.
diff --git a/arch/mips/cavium-octeon/executive/cvmx-pko.c b/arch/mips/cavium-octeon/executive/cvmx-pko.c
index b0efc35e95c4..7c4879e74318 100644
--- a/arch/mips/cavium-octeon/executive/cvmx-pko.c
+++ b/arch/mips/cavium-octeon/executive/cvmx-pko.c
@@ -35,7 +35,7 @@
#include <asm/octeon/cvmx-pko.h>
#include <asm/octeon/cvmx-helper.h>
-/**
+/*
* Internal state of packet output
*/
@@ -176,7 +176,7 @@ static void __cvmx_pko_chip_init(void)
}
}
-/**
+/*
* Call before any other calls to initialize the packet
* output system. This does chip global config, and should only be
* done by one core.
@@ -229,7 +229,7 @@ void cvmx_pko_initialize_global(void)
}
}
-/**
+/*
* This function does per-core initialization required by the PKO routines.
* This must be called on all cores that will do packet output, and must
* be called after the FPA has been initialized and filled with pages.
@@ -243,7 +243,7 @@ int cvmx_pko_initialize_local(void)
return 0;
}
-/**
+/*
* Enables the packet output hardware. It must already be
* configured.
*/
@@ -266,7 +266,7 @@ void cvmx_pko_enable(void)
cvmx_write_csr(CVMX_PKO_REG_FLAGS, flags.u64);
}
-/**
+/*
* Disables the packet output. Does not affect any configuration.
*/
void cvmx_pko_disable(void)
@@ -278,7 +278,7 @@ void cvmx_pko_disable(void)
}
EXPORT_SYMBOL_GPL(cvmx_pko_disable);
-/**
+/*
* Reset the packet output.
*/
static void __cvmx_pko_reset(void)
@@ -289,7 +289,7 @@ static void __cvmx_pko_reset(void)
cvmx_write_csr(CVMX_PKO_REG_FLAGS, pko_reg_flags.u64);
}
-/**
+/*
* Shutdown and free resources required by packet output.
*/
void cvmx_pko_shutdown(void)
@@ -320,7 +320,7 @@ void cvmx_pko_shutdown(void)
}
EXPORT_SYMBOL_GPL(cvmx_pko_shutdown);
-/**
+/*
* Configure a output port and the associated queues for use.
*
* @port: Port to configure.
@@ -548,7 +548,7 @@ cvmx_pko_status_t cvmx_pko_config_port(uint64_t port, uint64_t base_queue,
}
#ifdef PKO_DEBUG
-/**
+/*
* Show map of ports -> queues for different cores.
*/
void cvmx_pko_show_queue_map()
@@ -573,7 +573,7 @@ void cvmx_pko_show_queue_map()
}
#endif
-/**
+/*
* Rate limit a PKO port to a max packets/sec. This function is only
* supported on CN51XX and higher, excluding CN58XX.
*
@@ -606,7 +606,7 @@ int cvmx_pko_rate_limit_packets(int port, int packets_s, int burst)
return 0;
}
-/**
+/*
* Rate limit a PKO port to a max bits/sec. This function is only
* supported on CN51XX and higher, excluding CN58XX.
*
diff --git a/arch/mips/cavium-octeon/executive/cvmx-spi.c b/arch/mips/cavium-octeon/executive/cvmx-spi.c
index f51957a3e915..eb9333e84a6b 100644
--- a/arch/mips/cavium-octeon/executive/cvmx-spi.c
+++ b/arch/mips/cavium-octeon/executive/cvmx-spi.c
@@ -66,7 +66,7 @@ static cvmx_spi_callbacks_t cvmx_spi_callbacks = {
.interface_up_cb = cvmx_spi_interface_up_cb
};
-/**
+/*
* Get current SPI4 initialization callbacks
*
* @callbacks: Pointer to the callbacks structure.to fill
@@ -78,7 +78,7 @@ void cvmx_spi_get_callbacks(cvmx_spi_callbacks_t *callbacks)
memcpy(callbacks, &cvmx_spi_callbacks, sizeof(cvmx_spi_callbacks));
}
-/**
+/*
* Set new SPI4 initialization callbacks
*
* @new_callbacks: Pointer to an updated callbacks structure.
@@ -88,7 +88,7 @@ void cvmx_spi_set_callbacks(cvmx_spi_callbacks_t *new_callbacks)
memcpy(&cvmx_spi_callbacks, new_callbacks, sizeof(cvmx_spi_callbacks));
}
-/**
+/*
* Initialize and start the SPI interface.
*
* @interface: The identifier of the packet interface to configure and
@@ -133,7 +133,7 @@ int cvmx_spi_start_interface(int interface, cvmx_spi_mode_t mode, int timeout,
return res;
}
-/**
+/*
* This routine restarts the SPI interface after it has lost synchronization
* with its correspondent system.
*
@@ -179,7 +179,7 @@ int cvmx_spi_restart_interface(int interface, cvmx_spi_mode_t mode, int timeout)
}
EXPORT_SYMBOL_GPL(cvmx_spi_restart_interface);
-/**
+/*
* Callback to perform SPI4 reset
*
* @interface: The identifier of the packet interface to configure and
@@ -294,7 +294,7 @@ int cvmx_spi_reset_cb(int interface, cvmx_spi_mode_t mode)
return 0;
}
-/**
+/*
* Callback to setup calendar and miscellaneous settings before clock detection
*
* @interface: The identifier of the packet interface to configure and
@@ -413,7 +413,7 @@ int cvmx_spi_calendar_setup_cb(int interface, cvmx_spi_mode_t mode,
return 0;
}
-/**
+/*
* Callback to perform clock detection
*
* @interface: The identifier of the packet interface to configure and
@@ -491,7 +491,7 @@ int cvmx_spi_clock_detect_cb(int interface, cvmx_spi_mode_t mode, int timeout)
return 0;
}
-/**
+/*
* Callback to perform link training
*
* @interface: The identifier of the packet interface to configure and
@@ -560,7 +560,7 @@ int cvmx_spi_training_cb(int interface, cvmx_spi_mode_t mode, int timeout)
return 0;
}
-/**
+/*
* Callback to perform calendar data synchronization
*
* @interface: The identifier of the packet interface to configure and
@@ -617,7 +617,7 @@ int cvmx_spi_calendar_sync_cb(int interface, cvmx_spi_mode_t mode, int timeout)
return 0;
}
-/**
+/*
* Callback to handle interface up
*
* @interface: The identifier of the packet interface to configure and
diff --git a/arch/mips/cavium-octeon/flash_setup.c b/arch/mips/cavium-octeon/flash_setup.c
index a5e8f4a784af..c8a8c6d359b9 100644
--- a/arch/mips/cavium-octeon/flash_setup.c
+++ b/arch/mips/cavium-octeon/flash_setup.c
@@ -62,7 +62,7 @@ static void octeon_flash_map_copy_to(struct map_info *map, unsigned long to,
up(&octeon_bootbus_sem);
}
-/**
+/*
* Module/ driver initialization.
*
* Returns Zero on success
diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c
index ce4e2806159b..00bf269763cf 100644
--- a/arch/mips/cavium-octeon/setup.c
+++ b/arch/mips/cavium-octeon/setup.c
@@ -284,11 +284,6 @@ void octeon_crash_smp_send_stop(void)
#endif /* CONFIG_KEXEC */
-#ifdef CONFIG_CAVIUM_RESERVE32
-uint64_t octeon_reserve32_memory;
-EXPORT_SYMBOL(octeon_reserve32_memory);
-#endif
-
#ifdef CONFIG_KEXEC
/* crashkernel cmdline parameter is parsed _after_ memory setup
* we also parse it here (workaround for EHB5200) */
@@ -300,9 +295,10 @@ static int octeon_uart;
extern asmlinkage void handle_int(void);
/**
- * Return non zero if we are currently running in the Octeon simulator
+ * octeon_is_simulation - Return non-zero if we are currently running
+ * in the Octeon simulator
*
- * Returns
+ * Return: non-0 if running in the Octeon simulator, 0 otherwise
*/
int octeon_is_simulation(void)
{
@@ -311,10 +307,10 @@ int octeon_is_simulation(void)
EXPORT_SYMBOL(octeon_is_simulation);
/**
- * Return true if Octeon is in PCI Host mode. This means
+ * octeon_is_pci_host - Return true if Octeon is in PCI Host mode. This means
* Linux can control the PCI bus.
*
- * Returns Non zero if Octeon in host mode.
+ * Return: Non-zero if Octeon is in host mode.
*/
int octeon_is_pci_host(void)
{
@@ -326,9 +322,9 @@ int octeon_is_pci_host(void)
}
/**
- * Get the clock rate of Octeon
+ * octeon_get_clock_rate - Get the clock rate of Octeon
*
- * Returns Clock rate in HZ
+ * Return: Clock rate in HZ
*/
uint64_t octeon_get_clock_rate(void)
{
@@ -348,11 +344,11 @@ EXPORT_SYMBOL(octeon_get_io_clock_rate);
/**
- * Write to the LCD display connected to the bootbus. This display
- * exists on most Cavium evaluation boards. If it doesn't exist, then
- * this function doesn't do anything.
- *
+ * octeon_write_lcd - Write to the LCD display connected to the bootbus.
* @s: String to write
+ *
+ * This display exists on most Cavium evaluation boards. If it doesn't exist,
+ * then this function doesn't do anything.
*/
static void octeon_write_lcd(const char *s)
{
@@ -372,9 +368,9 @@ static void octeon_write_lcd(const char *s)
}
/**
- * Return the console uart passed by the bootloader
+ * octeon_get_boot_uart - Return the console uart passed by the bootloader
*
- * Returns uart (0 or 1)
+ * Return: uart number (0 or 1)
*/
static int octeon_get_boot_uart(void)
{
@@ -383,9 +379,9 @@ static int octeon_get_boot_uart(void)
}
/**
- * Get the coremask Linux was booted on.
+ * octeon_get_boot_coremask - Get the coremask Linux was booted on.
*
- * Returns Core mask
+ * Return: Core mask
*/
int octeon_get_boot_coremask(void)
{
@@ -393,7 +389,7 @@ int octeon_get_boot_coremask(void)
}
/**
- * Check the hardware BIST results for a CPU
+ * octeon_check_cpu_bist - Check the hardware BIST results for a CPU
*/
void octeon_check_cpu_bist(void)
{
@@ -424,7 +420,7 @@ void octeon_check_cpu_bist(void)
}
/**
- * Reboot Octeon
+ * octeon_restart - Reboot Octeon
*
* @command: Command to pass to the bootloader. Currently ignored.
*/
@@ -449,7 +445,7 @@ static void octeon_restart(char *command)
/**
- * Permanently stop a core.
+ * octeon_kill_core - Permanently stop a core.
*
* @arg: Ignored.
*/
@@ -469,7 +465,7 @@ static void octeon_kill_core(void *arg)
/**
- * Halt the system
+ * octeon_halt - Halt the system
*/
static void octeon_halt(void)
{
@@ -512,9 +508,9 @@ static void __init init_octeon_system_type(void)
}
/**
- * Return a string representing the system type
+ * octeon_board_type_string - Return a string representing the system type
*
- * Returns
+ * Return: system type string
*/
const char *octeon_board_type_string(void)
{
@@ -655,7 +651,7 @@ void octeon_user_io_init(void)
}
/**
- * Early entry point for arch setup
+ * prom_init - Early entry point for arch setup
*/
void __init prom_init(void)
{
@@ -665,9 +661,7 @@ void __init prom_init(void)
int i;
u64 t;
int argc;
-#ifdef CONFIG_CAVIUM_RESERVE32
- int64_t addr = -1;
-#endif
+
/*
* The bootloader passes a pointer to the boot descriptor in
* $a3, this is available as fw_arg3.
@@ -782,25 +776,6 @@ void __init prom_init(void)
cvmx_write_csr(CVMX_LED_UDD_DATX(1), 0);
cvmx_write_csr(CVMX_LED_EN, 1);
}
-#ifdef CONFIG_CAVIUM_RESERVE32
- /*
- * We need to temporarily allocate all memory in the reserve32
- * region. This makes sure the kernel doesn't allocate this
- * memory when it is getting memory from the
- * bootloader. Later, after the memory allocations are
- * complete, the reserve32 will be freed.
- *
- * Allocate memory for RESERVED32 aligned on 2MB boundary. This
- * is in case we later use hugetlb entries with it.
- */
- addr = cvmx_bootmem_phy_named_block_alloc(CONFIG_CAVIUM_RESERVE32 << 20,
- 0, 0, 2 << 20,
- "CAVIUM_RESERVE32", 0);
- if (addr < 0)
- pr_err("Failed to allocate CAVIUM_RESERVE32 memory area\n");
- else
- octeon_reserve32_memory = addr;
-#endif
#ifdef CONFIG_CAVIUM_OCTEON_LOCK_L2
if (cvmx_read_csr(CVMX_L2D_FUS3) & (3ull << 34)) {
@@ -1078,16 +1053,6 @@ void __init plat_mem_setup(void)
cvmx_bootmem_unlock();
#endif /* CONFIG_CRASH_DUMP */
-#ifdef CONFIG_CAVIUM_RESERVE32
- /*
- * Now that we've allocated the kernel memory it is safe to
- * free the reserved region. We free it here so that builtin
- * drivers can use the memory.
- */
- if (octeon_reserve32_memory)
- cvmx_bootmem_free_named("CAVIUM_RESERVE32");
-#endif /* CONFIG_CAVIUM_RESERVE32 */
-
if (total == 0)
panic("Unable to allocate memory from "
"cvmx_bootmem_phy_alloc");
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c
index 66ce5527da54..89954f5f87fb 100644
--- a/arch/mips/cavium-octeon/smp.c
+++ b/arch/mips/cavium-octeon/smp.c
@@ -91,7 +91,7 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
-/**
+/*
* Cause the function described by call_data to be executed on the passed
* cpu. When the function has finished, increment the finished field of
* call_data.
@@ -115,7 +115,7 @@ static inline void octeon_send_ipi_mask(const struct cpumask *mask,
octeon_send_ipi_single(i, action);
}
-/**
+/*
* Detect available CPUs, populate cpu_possible_mask
*/
static void octeon_smp_hotplug_setup(void)
@@ -202,9 +202,8 @@ int plat_post_relocation(long offset)
}
#endif /* CONFIG_RELOCATABLE */
-/**
+/*
* Firmware CPU startup hook
- *
*/
static int octeon_boot_secondary(int cpu, struct task_struct *idle)
{
@@ -232,7 +231,7 @@ static int octeon_boot_secondary(int cpu, struct task_struct *idle)
return 0;
}
-/**
+/*
* After we've done initial boot, this function is called to allow the
* board code to clean up state, if needed
*/
@@ -250,9 +249,8 @@ static void octeon_init_secondary(void)
octeon_irq_setup_secondary();
}
-/**
+/*
* Callout to firmware before smp_init
- *
*/
static void __init octeon_prepare_cpus(unsigned int max_cpus)
{
@@ -268,7 +266,7 @@ static void __init octeon_prepare_cpus(unsigned int max_cpus)
}
}
-/**
+/*
* Last chance for the board code to finish SMP initialization before
* the CPU is "online".
*/
diff --git a/arch/mips/configs/generic/board-marduk.config b/arch/mips/configs/generic/board-marduk.config
new file mode 100644
index 000000000000..05ca34cd5a73
--- /dev/null
+++ b/arch/mips/configs/generic/board-marduk.config
@@ -0,0 +1,53 @@
+CONFIG_FIT_IMAGE_FDT_MARDUK=y
+
+CONFIG_SCSI=y
+CONFIG_BLK_DEV_SD=y
+
+CONFIG_CLKSRC_PISTACHIO=y
+
+CONFIG_COMMON_CLK_PISTACHIO=y
+
+CONFIG_DMADEVICES=y
+CONFIG_IMG_MDC_DMA=y
+
+CONFIG_GPIOLIB=y
+CONFIG_GPIO_SYSFS=y
+CONFIG_GPIO_PCH=y
+
+CONFIG_I2C=y
+CONFIG_I2C_IMG=y
+
+CONFIG_MMC=y
+CONFIG_MMC_SDHCI=y
+CONFIG_MMC_DW=y
+CONFIG_MMC_DW_PLTFM=y
+
+CONFIG_NETDEVICES=y
+CONFIG_STMMAC_ETH=y
+CONFIG_STMMAC_PLATFORM=y
+
+CONFIG_PHY_PISTACHIO_USB=y
+
+CONFIG_PINCTRL=y
+CONFIG_PINCTRL_PISTACHIO=y
+
+CONFIG_RESET_PISTACHIO=y
+
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_OF_PLATFORM=y
+CONFIG_SERIAL_8250_DW=y
+
+CONFIG_SPI=y
+CONFIG_SRAM=y
+
+CONFIG_USB=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_DWC2=y
+
+CONFIG_CRYPTO_DEV_IMGTEC_HASH=y
+CONFIG_IMGPDC_WDT=y
+CONFIG_IR_IMG=y
+CONFIG_CC10001_ADC=y
+CONFIG_SND_SOC_IMG=y
diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig
deleted file mode 100644
index b9adf15ebbec..000000000000
--- a/arch/mips/configs/pistachio_defconfig
+++ /dev/null
@@ -1,316 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_DEFAULT_HOSTNAME="localhost"
-CONFIG_SYSVIPC=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_PREEMPT_VOLUNTARY=y
-CONFIG_IKCONFIG=m
-CONFIG_IKCONFIG_PROC=y
-CONFIG_LOG_BUF_SHIFT=18
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_CFS_BANDWIDTH=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_NAMESPACES=y
-CONFIG_USER_NS=y
-CONFIG_BLK_DEV_INITRD=y
-# CONFIG_RD_BZIP2 is not set
-# CONFIG_RD_LZMA is not set
-# CONFIG_RD_LZO is not set
-# CONFIG_RD_LZ4 is not set
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_EMBEDDED=y
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_MACH_PISTACHIO=y
-CONFIG_MIPS_CPS=y
-CONFIG_NR_CPUS=4
-CONFIG_PM_DEBUG=y
-CONFIG_PM_ADVANCED_DEBUG=y
-CONFIG_CPU_IDLE=y
-# CONFIG_MIPS_CPS_CPUIDLE is not set
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
-CONFIG_PARTITION_ADVANCED=y
-# CONFIG_COMPACTION is not set
-CONFIG_DEFAULT_MMAP_MIN_ADDR=32768
-CONFIG_ZSMALLOC=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_NET_KEY=m
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_MULTIPLE_TABLES=y
-CONFIG_IP_ROUTE_MULTIPATH=y
-CONFIG_IP_ROUTE_VERBOSE=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_IP_PIMSM_V2=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=m
-CONFIG_INET_ESP=m
-CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
-# CONFIG_INET_DIAG is not set
-CONFIG_TCP_CONG_ADVANCED=y
-# CONFIG_TCP_CONG_BIC is not set
-# CONFIG_TCP_CONG_WESTWOOD is not set
-# CONFIG_TCP_CONG_HTCP is not set
-CONFIG_TCP_CONG_LP=m
-CONFIG_TCP_MD5SIG=y
-CONFIG_INET6_AH=m
-CONFIG_INET6_ESP=m
-CONFIG_INET6_XFRM_MODE_TRANSPORT=m
-CONFIG_INET6_XFRM_MODE_TUNNEL=m
-CONFIG_INET6_XFRM_MODE_BEET=m
-CONFIG_IPV6_SIT=m
-CONFIG_NETWORK_SECMARK=y
-CONFIG_NETFILTER=y
-# CONFIG_BRIDGE_NETFILTER is not set
-CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CT_NETLINK=y
-CONFIG_NETFILTER_XT_MARK=m
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
-CONFIG_NETFILTER_XT_TARGET_DSCP=y
-CONFIG_NETFILTER_XT_TARGET_NFLOG=y
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
-CONFIG_NETFILTER_XT_TARGET_SECMARK=y
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_DSCP=y
-CONFIG_NETFILTER_XT_MATCH_POLICY=y
-CONFIG_NETFILTER_XT_MATCH_STATE=y
-CONFIG_NF_NAT_IPV4=m
-CONFIG_IP_NF_IPTABLES=y
-CONFIG_IP_NF_FILTER=y
-CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_MANGLE=y
-CONFIG_NF_NAT_IPV6=m
-CONFIG_IP6_NF_IPTABLES=m
-CONFIG_IP6_NF_MATCH_IPV6HEADER=m
-CONFIG_IP6_NF_FILTER=m
-CONFIG_IP6_NF_TARGET_REJECT=m
-CONFIG_IP6_NF_MANGLE=m
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_HTB=m
-CONFIG_NET_SCH_CODEL=m
-CONFIG_NET_SCH_FQ_CODEL=m
-CONFIG_NET_CLS_U32=m
-CONFIG_CLS_U32_MARK=y
-CONFIG_BT=m
-CONFIG_BT_RFCOMM=m
-CONFIG_BT_HCIBTUSB=m
-CONFIG_BT_HCIBFUSB=m
-CONFIG_BT_HCIVHCI=m
-CONFIG_CFG80211=m
-CONFIG_NL80211_TESTMODE=y
-CONFIG_CFG80211_DEBUGFS=y
-CONFIG_CFG80211_WEXT=y
-CONFIG_MAC80211=m
-CONFIG_MAC80211_LEDS=y
-CONFIG_MAC80211_DEBUGFS=y
-CONFIG_MAC80211_DEBUG_MENU=y
-CONFIG_MAC80211_VERBOSE_DEBUG=y
-CONFIG_RFKILL=y
-CONFIG_DEVTMPFS=y
-CONFIG_DEVTMPFS_MOUNT=y
-CONFIG_DEBUG_DEVRES=y
-CONFIG_CONNECTOR=y
-CONFIG_MTD=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_SPI_NOR=y
-CONFIG_MTD_UBI=y
-CONFIG_MTD_UBI_BLOCK=y
-CONFIG_ZRAM=m
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_BLK_DEV_SR=m
-CONFIG_SCSI_SPI_ATTRS=y
-CONFIG_MD=y
-CONFIG_BLK_DEV_DM=y
-CONFIG_DM_CRYPT=y
-CONFIG_DM_VERITY=y
-CONFIG_NETDEVICES=y
-CONFIG_TUN=m
-CONFIG_VETH=m
-# CONFIG_NET_VENDOR_MARVELL is not set
-# CONFIG_NET_VENDOR_MICREL is not set
-# CONFIG_NET_VENDOR_MICROCHIP is not set
-# CONFIG_NET_VENDOR_NATSEMI is not set
-# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SMSC is not set
-CONFIG_STMMAC_ETH=y
-# CONFIG_NET_VENDOR_VIA is not set
-CONFIG_PPP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_USB_PEGASUS=m
-CONFIG_USB_RTL8150=m
-CONFIG_USB_RTL8152=m
-CONFIG_USB_NET_DM9601=m
-CONFIG_USB_NET_SMSC75XX=m
-CONFIG_USB_NET_SMSC95XX=m
-CONFIG_USB_NET_MCS7830=m
-# CONFIG_USB_NET_CDC_SUBSET is not set
-# CONFIG_USB_NET_ZAURUS is not set
-CONFIG_HOSTAP=m
-CONFIG_HOSTAP_FIRMWARE=y
-CONFIG_HOSTAP_FIRMWARE_NVRAM=y
-CONFIG_LIBERTAS_THINFIRM=m
-CONFIG_RT2X00=m
-CONFIG_RT2800USB=m
-CONFIG_MAC80211_HWSIM=m
-CONFIG_USB_NET_RNDIS_WLAN=m
-CONFIG_INPUT_EVDEV=y
-# CONFIG_KEYBOARD_ATKBD is not set
-CONFIG_KEYBOARD_GPIO=y
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_SERIO is not set
-# CONFIG_VT is not set
-# CONFIG_LEGACY_PTYS is not set
-CONFIG_SERIAL_8250=y
-# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_DW=y
-CONFIG_SERIAL_OF_PLATFORM=y
-CONFIG_HW_RANDOM=y
-CONFIG_TCG_TPM=y
-CONFIG_I2C=y
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_IMG=y
-CONFIG_I2C_STUB=m
-CONFIG_SPI=y
-CONFIG_SPI_BITBANG=m
-CONFIG_SPI_IMG_SPFI=y
-CONFIG_SPI_SPIDEV=y
-CONFIG_DEBUG_GPIO=y
-CONFIG_GPIO_SYSFS=y
-CONFIG_POWER_SUPPLY=y
-CONFIG_THERMAL=y
-CONFIG_WATCHDOG=y
-CONFIG_IMGPDC_WDT=y
-CONFIG_REGULATOR_FIXED_VOLTAGE=y
-CONFIG_REGULATOR_GPIO=y
-CONFIG_RC_CORE=y
-CONFIG_RC_DEVICES=y
-CONFIG_IR_IMG=y
-CONFIG_IR_IMG_NEC=y
-CONFIG_IR_IMG_JVC=y
-CONFIG_IR_IMG_SONY=y
-CONFIG_IR_IMG_SHARP=y
-CONFIG_IR_IMG_SANYO=y
-CONFIG_IR_IMG_RC5=y
-CONFIG_IR_IMG_RC6=y
-CONFIG_MEDIA_SUPPORT=y
-CONFIG_FB=y
-CONFIG_FB_MODE_HELPERS=y
-# CONFIG_LCD_CLASS_DEVICE is not set
-CONFIG_BACKLIGHT_CLASS_DEVICE=y
-CONFIG_SOUND=y
-CONFIG_SND=y
-CONFIG_SND_HRTIMER=m
-CONFIG_SND_DYNAMIC_MINORS=y
-CONFIG_SND_SEQUENCER=m
-CONFIG_SND_SEQ_DUMMY=m
-# CONFIG_SND_SPI is not set
-CONFIG_SND_USB_AUDIO=m
-CONFIG_USB=y
-CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-# CONFIG_USB_DEFAULT_PERSIST is not set
-CONFIG_USB_MON=y
-CONFIG_USB_EHCI_HCD=y
-CONFIG_USB_EHCI_ROOT_HUB_TT=y
-CONFIG_USB_ACM=y
-CONFIG_USB_STORAGE=y
-CONFIG_USB_DWC2=y
-CONFIG_USB_SERIAL=y
-CONFIG_USB_SERIAL_GENERIC=y
-CONFIG_USB_SERIAL_CP210X=m
-CONFIG_USB_SERIAL_FTDI_SIO=m
-CONFIG_USB_SERIAL_KEYSPAN=m
-CONFIG_USB_SERIAL_PL2303=m
-CONFIG_USB_SERIAL_OTI6858=m
-CONFIG_USB_SERIAL_QUALCOMM=m
-CONFIG_USB_SERIAL_SIERRAWIRELESS=m
-CONFIG_USB_SERIAL_OPTION=m
-CONFIG_MMC=y
-CONFIG_MMC_BLOCK_MINORS=16
-CONFIG_MMC_TEST=m
-CONFIG_MMC_DW=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_RTC_CLASS=y
-CONFIG_DMADEVICES=y
-CONFIG_IMG_MDC_DMA=y
-CONFIG_STAGING=y
-CONFIG_ASHMEM=y
-# CONFIG_IOMMU_SUPPORT is not set
-CONFIG_MEMORY=y
-CONFIG_IIO=y
-CONFIG_CC10001_ADC=y
-CONFIG_PWM=y
-CONFIG_PWM_IMG=y
-CONFIG_PHY_PISTACHIO_USB=y
-CONFIG_ANDROID=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_POSIX_ACL=y
-CONFIG_EXT4_FS_SECURITY=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_ZISOFS=y
-CONFIG_UDF_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_ECRYPT_FS=y
-CONFIG_HFSPLUS_FS=m
-CONFIG_UBIFS_FS=y
-CONFIG_SQUASHFS=y
-CONFIG_SQUASHFS_FILE_DIRECT=y
-CONFIG_SQUASHFS_LZO=y
-CONFIG_PSTORE=y
-CONFIG_PSTORE_CONSOLE=y
-CONFIG_PSTORE_RAM=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NLS_DEFAULT="utf8"
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_ASCII=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_SECURITY=y
-CONFIG_SECURITY_NETWORK=y
-CONFIG_SECURITY_YAMA=y
-CONFIG_CRYPTO_AUTHENC=y
-CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPTO_SHA1=y
-CONFIG_CRYPTO_SHA256=y
-CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_ARC4=y
-CONFIG_CRYPTO_DES=y
-CONFIG_CRC_CCITT=y
-CONFIG_CRC_T10DIF=m
-CONFIG_CRC7=m
-# CONFIG_XZ_DEC_X86 is not set
-CONFIG_PRINTK_TIME=y
-CONFIG_DEBUG_INFO=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0
-# CONFIG_SCHED_DEBUG is not set
-CONFIG_SCHEDSTATS=y
-CONFIG_DEBUG_SPINLOCK=y
-CONFIG_DEBUG_CREDENTIALS=y
-CONFIG_FUNCTION_TRACER=y
-CONFIG_BLK_DEV_IO_TRACE=y
-CONFIG_LKDTM=y
-CONFIG_TEST_UDELAY=m
diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig
index 657dd93c5e76..7dc5b3821cc6 100644
--- a/arch/mips/generic/Kconfig
+++ b/arch/mips/generic/Kconfig
@@ -58,6 +58,12 @@ config FIT_IMAGE_FDT_BOSTON
enable this if you wish to boot on a MIPS Boston board, as it is
expected by the bootloader.
+config FIT_IMAGE_FDT_MARDUK
+ bool "Include FDT for IMG Pistachio Marduk (CI40) boards"
+ help
+ Enable this to include the FDT for the IMG Pistachio Marduk (CI40)
+ from Imagination Technologies in the FIT kernel image.
+
config FIT_IMAGE_FDT_NI169445
bool "Include FDT for NI 169445"
help
diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform
index b871af16b5b6..e1abc113b409 100644
--- a/arch/mips/generic/Platform
+++ b/arch/mips/generic/Platform
@@ -24,3 +24,4 @@ its-$(CONFIG_FIT_IMAGE_FDT_LUTON) += board-luton.its.S
its-$(CONFIG_FIT_IMAGE_FDT_JAGUAR2) += board-jaguar2.its.S
its-$(CONFIG_FIT_IMAGE_FDT_SERVAL) += board-serval.its.S
its-$(CONFIG_FIT_IMAGE_FDT_XILFPGA) += board-xilfpga.its.S
+its-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += board-marduk.its.S
diff --git a/arch/mips/generic/board-ingenic.c b/arch/mips/generic/board-ingenic.c
index 0cec0bea13d6..3f44f14bdb33 100644
--- a/arch/mips/generic/board-ingenic.c
+++ b/arch/mips/generic/board-ingenic.c
@@ -7,6 +7,8 @@
* Copyright (C) 2020 Paul Cercueil <paul@crapouillou.net>
*/
+#include <linux/clk.h>
+#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/of_fdt.h>
#include <linux/pm.h>
@@ -21,6 +23,10 @@
static __init char *ingenic_get_system_type(unsigned long machtype)
{
switch (machtype) {
+ case MACH_INGENIC_X2100:
+ return "X2100";
+ case MACH_INGENIC_X2000H:
+ return "X2000H";
case MACH_INGENIC_X2000E:
return "X2000E";
case MACH_INGENIC_X2000:
@@ -37,8 +43,18 @@ static __init char *ingenic_get_system_type(unsigned long machtype)
return "JZ4775";
case MACH_INGENIC_JZ4770:
return "JZ4770";
+ case MACH_INGENIC_JZ4760B:
+ return "JZ4760B";
+ case MACH_INGENIC_JZ4760:
+ return "JZ4760";
+ case MACH_INGENIC_JZ4755:
+ return "JZ4755";
+ case MACH_INGENIC_JZ4750:
+ return "JZ4750";
case MACH_INGENIC_JZ4725B:
return "JZ4725B";
+ case MACH_INGENIC_JZ4730:
+ return "JZ4730";
default:
return "JZ4740";
}
@@ -61,8 +77,13 @@ static __init const void *ingenic_fixup_fdt(const void *fdt, const void *match_d
}
static const struct of_device_id ingenic_of_match[] __initconst = {
+ { .compatible = "ingenic,jz4730", .data = (void *)MACH_INGENIC_JZ4730 },
{ .compatible = "ingenic,jz4740", .data = (void *)MACH_INGENIC_JZ4740 },
{ .compatible = "ingenic,jz4725b", .data = (void *)MACH_INGENIC_JZ4725B },
+ { .compatible = "ingenic,jz4750", .data = (void *)MACH_INGENIC_JZ4750 },
+ { .compatible = "ingenic,jz4755", .data = (void *)MACH_INGENIC_JZ4755 },
+ { .compatible = "ingenic,jz4760", .data = (void *)MACH_INGENIC_JZ4760 },
+ { .compatible = "ingenic,jz4760b", .data = (void *)MACH_INGENIC_JZ4760B },
{ .compatible = "ingenic,jz4770", .data = (void *)MACH_INGENIC_JZ4770 },
{ .compatible = "ingenic,jz4775", .data = (void *)MACH_INGENIC_JZ4775 },
{ .compatible = "ingenic,jz4780", .data = (void *)MACH_INGENIC_JZ4780 },
@@ -71,6 +92,8 @@ static const struct of_device_id ingenic_of_match[] __initconst = {
{ .compatible = "ingenic,x1830", .data = (void *)MACH_INGENIC_X1830 },
{ .compatible = "ingenic,x2000", .data = (void *)MACH_INGENIC_X2000 },
{ .compatible = "ingenic,x2000e", .data = (void *)MACH_INGENIC_X2000E },
+ { .compatible = "ingenic,x2000h", .data = (void *)MACH_INGENIC_X2000H },
+ { .compatible = "ingenic,x2100", .data = (void *)MACH_INGENIC_X2100 },
{}
};
@@ -108,10 +131,36 @@ static const struct platform_suspend_ops ingenic_pm_ops __maybe_unused = {
static int __init ingenic_pm_init(void)
{
+ struct device_node *cpu_node;
+ struct clk *cpu0_clk;
+ int ret;
+
if (boot_cpu_type() == CPU_XBURST) {
if (IS_ENABLED(CONFIG_PM_SLEEP))
suspend_set_ops(&ingenic_pm_ops);
_machine_halt = ingenic_halt;
+
+ /*
+ * Unconditionally enable the clock for the first CPU.
+ * This makes sure that the PLL that feeds the CPU won't be
+ * stopped while the kernel is running.
+ */
+ cpu_node = of_get_cpu_node(0, NULL);
+ if (!cpu_node) {
+ pr_err("Unable to get CPU node\n");
+ } else {
+ cpu0_clk = of_clk_get(cpu_node, 0);
+ if (IS_ERR(cpu0_clk)) {
+ pr_err("Unable to get CPU0 clock\n");
+ return PTR_ERR(cpu0_clk);
+ }
+
+ ret = clk_prepare_enable(cpu0_clk);
+ if (ret) {
+ pr_err("Unable to enable CPU0 clock\n");
+ return ret;
+ }
+ }
}
return 0;
diff --git a/arch/mips/generic/board-marduk.its.S b/arch/mips/generic/board-marduk.its.S
new file mode 100644
index 000000000000..4f633794db90
--- /dev/null
+++ b/arch/mips/generic/board-marduk.its.S
@@ -0,0 +1,22 @@
+/ {
+ images {
+ fdt-marduk {
+ description = "img,pistachio-marduk Device Tree";
+ data = /incbin/("boot/dts/img/pistachio_marduk.dtb");
+ type = "flat_dt";
+ arch = "mips";
+ compression = "none";
+ hash {
+ algo = "sha1";
+ };
+ };
+ };
+
+ configurations {
+ conf-marduk {
+ description = "Marduk Linux kernel";
+ kernel = "kernel";
+ fdt = "fdt-marduk";
+ };
+ };
+};
diff --git a/arch/mips/generic/board-ocelot.c b/arch/mips/generic/board-ocelot.c
index c238e95190ac..7115410acb4f 100644
--- a/arch/mips/generic/board-ocelot.c
+++ b/arch/mips/generic/board-ocelot.c
@@ -26,13 +26,13 @@ static __init bool ocelot_detect(void)
tlb_probe_hazard();
idx = read_c0_index();
if (idx < 0)
- return 0;
+ return false;
/* A TLB entry exists, lets assume its usable and check the CHIP ID */
rev = __raw_readl((void __iomem *)DEVCPU_GCB_CHIP_REGS_CHIP_ID);
if ((rev & CHIP_ID_PART_ID) != OCELOT_PART_ID)
- return 0;
+ return false;
/* Copy command line from bootloader early for Initrd detection */
if (fw_arg0 < 10 && (fw_arg1 & 0xFFF00000) == 0x80000000) {
@@ -44,7 +44,7 @@ static __init bool ocelot_detect(void)
strcpy(arcs_cmdline, prom_argv[1]);
}
- return 1;
+ return true;
}
static void __init ocelot_earlyprintk_init(void)
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 95e1f7f3597f..a0b9e7c1e4fc 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -206,7 +206,7 @@ ATOMIC_OPS(atomic64, xor, s64, ^=, xor, lld, scd)
* The function returns the old value of @v minus @i.
*/
#define ATOMIC_SIP_OP(pfx, type, op, ll, sc) \
-static __inline__ int arch_##pfx##_sub_if_positive(type i, pfx##_t * v) \
+static __inline__ type arch_##pfx##_sub_if_positive(type i, pfx##_t * v) \
{ \
type temp, result; \
\
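
The return-type change above is not cosmetic: ATOMIC_SIP_OP also instantiates arch_atomic64_sub_if_positive(), whose s64 result ("the old value of @v minus @i") would be silently truncated through an int return. A userspace illustration of the truncation being fixed; the helper is made up for demonstration and is not the kernel macro.

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: the old prototype squeezed a 64-bit result through int. */
static int sub_if_positive_broken(int64_t i, int64_t *v)
{
	int64_t result = *v - i;

	if (result >= 0)
		*v = result;
	return result;			/* high 32 bits are lost here */
}

int main(void)
{
	int64_t v = INT64_C(0x200000000);	/* larger than UINT32_MAX */

	/* On the usual two's-complement ABIs this prints 0, not 0x200000000. */
	printf("%d\n", sub_if_positive_broken(0, &v));
	return 0;
}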
diff --git a/arch/mips/include/asm/bootinfo.h b/arch/mips/include/asm/bootinfo.h
index 4c2e8173e6ec..2128ba903391 100644
--- a/arch/mips/include/asm/bootinfo.h
+++ b/arch/mips/include/asm/bootinfo.h
@@ -75,6 +75,7 @@ enum ingenic_machine_type {
MACH_INGENIC_JZ4750,
MACH_INGENIC_JZ4755,
MACH_INGENIC_JZ4760,
+ MACH_INGENIC_JZ4760B,
MACH_INGENIC_JZ4770,
MACH_INGENIC_JZ4775,
MACH_INGENIC_JZ4780,
@@ -83,6 +84,8 @@ enum ingenic_machine_type {
MACH_INGENIC_X1830,
MACH_INGENIC_X2000,
MACH_INGENIC_X2000E,
+ MACH_INGENIC_X2000H,
+ MACH_INGENIC_X2100,
};
extern char *system_type;
diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h
index d687b40b9fbb..b3dc9c589442 100644
--- a/arch/mips/include/asm/cacheflush.h
+++ b/arch/mips/include/asm/cacheflush.h
@@ -125,13 +125,7 @@ static inline void kunmap_noncoherent(void)
kunmap_coherent();
}
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-static inline void flush_kernel_dcache_page(struct page *page)
-{
- BUG_ON(cpu_has_dc_aliases && PageHighMem(page));
- flush_dcache_page(page);
-}
-
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
/*
* For now flush_kernel_vmap_range and invalidate_kernel_vmap_range both do a
* cache writeback and invalidate operation.
diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h
index 9e6211e6d76b..d45a52f65b7a 100644
--- a/arch/mips/include/asm/cpu.h
+++ b/arch/mips/include/asm/cpu.h
@@ -46,8 +46,8 @@
#define PRID_COMP_NETLOGIC 0x0c0000
#define PRID_COMP_CAVIUM 0x0d0000
#define PRID_COMP_LOONGSON 0x140000
-#define PRID_COMP_INGENIC_13 0x130000 /* X2000 */
-#define PRID_COMP_INGENIC_D0 0xd00000 /* JZ4740, JZ4750, X1830 */
+#define PRID_COMP_INGENIC_13 0x130000 /* X2000, X2100 */
+#define PRID_COMP_INGENIC_D0 0xd00000 /* JZ4730, JZ4740, JZ4750, JZ4755, JZ4760, X1830 */
#define PRID_COMP_INGENIC_D1 0xd10000 /* JZ4770, JZ4775, X1000 */
#define PRID_COMP_INGENIC_E1 0xe10000 /* JZ4780 */
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index 6c590ef27648..67e130d3f038 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -76,13 +76,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
return -EFAULT;
- get_online_cpus();
+ cpus_read_lock();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
- put_online_cpus();
+ cpus_read_unlock();
return -ESRCH;
}
@@ -147,7 +147,7 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
- put_online_cpus();
+ cpus_read_unlock();
return retval;
}
@@ -166,7 +166,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
if (len < real_len)
return -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
rcu_read_lock();
retval = -ESRCH;
@@ -182,7 +182,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
out_unlock:
rcu_read_unlock();
- put_online_cpus();
+ cpus_read_unlock();
if (retval)
return retval;
if (copy_to_user(user_mask_ptr, &mask, real_len))
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 73c8e7990a97..95aa86fa6077 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -859,10 +859,10 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value)
* scheduled in then it will already have picked up the new FP mode
* whilst doing so.
*/
- get_online_cpus();
+ cpus_read_lock();
for_each_cpu_and(cpu, &process_cpus, cpu_online_mask)
work_on_cpu(cpu, prepare_for_fp_mode_switch, NULL);
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
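
The fpaff.c and process.c hunks are a mechanical conversion: get_online_cpus()/put_online_cpus() were retired in favour of cpus_read_lock()/cpus_read_unlock(), which take the CPU hotplug lock for reading under its current name. A sketch of the calling pattern; the callback walk is illustrative.

#include <linux/cpu.h>
#include <linux/cpumask.h>

/* Sketch: keep the set of online CPUs stable while visiting each one. */
static void visit_online_cpus(void (*fn)(int cpu))
{
	int cpu;

	cpus_read_lock();		/* was get_online_cpus() */
	for_each_online_cpu(cpu)
		fn(cpu);
	cpus_read_unlock();		/* was put_online_cpus() */
}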
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 23a140327a0b..f979adfd4fc2 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -452,8 +452,9 @@ static void __init mips_parse_crashkernel(void)
return;
if (crash_base <= 0) {
- crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_MAX,
- crash_size, CRASH_ALIGN);
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ CRASH_ALIGN,
+ CRASH_ADDR_MAX);
if (!crash_base) {
pr_warn("crashkernel reservation failed - No suitable area found.\n");
return;
@@ -461,8 +462,9 @@ static void __init mips_parse_crashkernel(void)
} else {
unsigned long long start;
- start = memblock_find_in_range(crash_base, crash_base + crash_size,
- crash_size, 1);
+ start = memblock_phys_alloc_range(crash_size, 1,
+ crash_base,
+ crash_base + crash_size);
if (start != crash_base) {
pr_warn("Invalid memory region reserved for crash kernel\n");
return;
@@ -656,10 +658,6 @@ static void __init arch_mem_init(char **cmdline_p)
mips_reserve_vmcore();
mips_parse_crashkernel();
-#ifdef CONFIG_KEXEC
- if (crashk_res.start != crashk_res.end)
- memblock_reserve(crashk_res.start, resource_size(&crashk_res));
-#endif
device_tree_init();
/*
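
memblock_find_in_range() only located a candidate region, so callers had to memblock_reserve() it themselves, which is why the CONFIG_KEXEC reservation block above disappears. memblock_phys_alloc_range(size, align, start, end) searches and reserves in one call. A sketch with illustrative bounds (the DEMO_* constants are not the MIPS values):

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/printk.h>
#include <linux/sizes.h>

#define DEMO_CRASH_ALIGN	SZ_1M		/* illustrative values */
#define DEMO_CRASH_ADDR_MAX	SZ_512M

static phys_addr_t __init reserve_crash_region(phys_addr_t size)
{
	/* Returns 0 on failure; the region is already reserved on success. */
	phys_addr_t base = memblock_phys_alloc_range(size, DEMO_CRASH_ALIGN,
						     DEMO_CRASH_ALIGN,
						     DEMO_CRASH_ADDR_MAX);

	if (!base)
		pr_warn("crashkernel: no suitable area found\n");
	return base;
}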
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index c2d2e19abea8..56c8d3cf42ed 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -385,3 +385,5 @@
444 n32 landlock_create_ruleset sys_landlock_create_ruleset
445 n32 landlock_add_rule sys_landlock_add_rule
446 n32 landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 n32 process_mrelease sys_process_mrelease
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index ac653d08b1ea..1ca7bc337932 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -361,3 +361,5 @@
444 n64 landlock_create_ruleset sys_landlock_create_ruleset
445 n64 landlock_add_rule sys_landlock_add_rule
446 n64 landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 n64 process_mrelease sys_process_mrelease
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index fae35882a165..201237fd0f43 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -434,3 +434,5 @@
444 o32 landlock_create_ruleset sys_landlock_create_ruleset
445 o32 landlock_add_rule sys_landlock_add_rule
446 o32 landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 o32 process_mrelease sys_process_mrelease
diff --git a/arch/mips/kernel/uprobes.c b/arch/mips/kernel/uprobes.c
index 6dbe4eab0a0e..9db2a6db5f62 100644
--- a/arch/mips/kernel/uprobes.c
+++ b/arch/mips/kernel/uprobes.c
@@ -75,7 +75,7 @@ bool is_trap_insn(uprobe_opcode_t *insn)
case tlt_op:
case tltu_op:
case tne_op:
- return 1;
+ return true;
}
break;
@@ -87,12 +87,12 @@ bool is_trap_insn(uprobe_opcode_t *insn)
case tlti_op:
case tltiu_op:
case tnei_op:
- return 1;
+ return true;
}
break;
}
- return 0;
+ return false;
}
#define UPROBE_TRAP_NR ULONG_MAX
@@ -254,9 +254,9 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs)
* See if the instruction can be emulated.
* Returns true if instruction was emulated, false otherwise.
*
- * For now we always emulate so this function just returns 0.
+ * For now we always emulate so this function just returns false.
*/
bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- return 0;
+ return false;
}
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index c67250a956b8..d3710959da55 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -2,21 +2,18 @@
# Makefile for KVM support for MIPS
#
-common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o)
+ccflags-y += -Ivirt/kvm -Iarch/mips/kvm
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
+kvm-y := $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o)
+kvm-$(CONFIG_CPU_HAS_MSA) += msa.o
-common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
-
-kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \
+kvm-y += mips.o emulate.o entry.o \
interrupt.o stats.o \
fpu.o
-kvm-objs += hypcall.o
-kvm-objs += mmu.o
-ifdef CONFIG_CPU_LOONGSON64
-kvm-objs += loongson_ipi.o
-endif
+kvm-y += hypcall.o
+kvm-y += mmu.o
+kvm-$(CONFIG_CPU_LOONGSON64) += loongson_ipi.o
-kvm-objs += vz.o
+kvm-y += vz.o
obj-$(CONFIG_KVM) += kvm.o
obj-y += callback.o tlb.o
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 6d1f68cf4edf..1bfd1b501d82 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -442,7 +442,7 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn,
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
kvm_mips_flush_gpa_pt(kvm, range->start, range->end);
- return 1;
+ return true;
}
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -486,7 +486,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
if (!gpa_pte)
- return 0;
+ return false;
return pte_young(*gpa_pte);
}
diff --git a/arch/mips/loongson2ef/common/Makefile b/arch/mips/loongson2ef/common/Makefile
index d5ab3e543ea3..30ea8b5ca685 100644
--- a/arch/mips/loongson2ef/common/Makefile
+++ b/arch/mips/loongson2ef/common/Makefile
@@ -4,12 +4,14 @@
#
obj-y += setup.o init.o env.o time.o reset.o irq.o \
- bonito-irq.o mem.o machtype.o platform.o serial.o
+ bonito-irq.o mem.o machtype.o platform.o
obj-$(CONFIG_PCI) += pci.o
#
# Serial port support
#
+obj-$(CONFIG_LOONGSON_UART_BASE) += serial.o
+obj-$(CONFIG_EARLY_PRINTK) += serial.o
obj-$(CONFIG_LOONGSON_UART_BASE) += uart_base.o
obj-$(CONFIG_LOONGSON_MC146818) += rtc.o
diff --git a/arch/mips/mm/c-octeon.c b/arch/mips/mm/c-octeon.c
index 8ae181e08311..ec2ae501539a 100644
--- a/arch/mips/mm/c-octeon.c
+++ b/arch/mips/mm/c-octeon.c
@@ -30,7 +30,7 @@
unsigned long long cache_err_dcache[NR_CPUS];
EXPORT_SYMBOL_GPL(cache_err_dcache);
-/**
+/*
* Octeon automatically flushes the dcache on tlb changes, so
* from Linux's viewpoint it acts much like a physically
* tagged cache. No flushing is needed
@@ -56,8 +56,8 @@ static void local_octeon_flush_icache_range(unsigned long start,
}
/**
- * Flush caches as necessary for all cores affected by a
- * vma. If no vma is supplied, all cores are flushed.
+ * octeon_flush_icache_all_cores - Flush caches as necessary for all cores
+ * affected by a vma. If no vma is supplied, all cores are flushed.
*
* @vma: VMA to flush or NULL to flush all icaches.
*/
@@ -92,7 +92,7 @@ static void octeon_flush_icache_all_cores(struct vm_area_struct *vma)
}
-/**
+/*
* Called to flush the icache on all cores
*/
static void octeon_flush_icache_all(void)
@@ -102,8 +102,7 @@ static void octeon_flush_icache_all(void)
/**
- * Called to flush all memory associated with a memory
- * context.
+ * octeon_flush_cache_mm - flush all memory associated with a memory context.
*
* @mm: Memory context to flush
*/
@@ -116,7 +115,7 @@ static void octeon_flush_cache_mm(struct mm_struct *mm)
}
-/**
+/*
* Flush a range of kernel addresses out of the icache
*
*/
@@ -127,11 +126,11 @@ static void octeon_flush_icache_range(unsigned long start, unsigned long end)
/**
- * Flush a range out of a vma
+ * octeon_flush_cache_range - Flush a range out of a vma
*
* @vma: VMA to flush
- * @start:
- * @end:
+ * @start: beginning address for flush
+ * @end: ending address for flush
*/
static void octeon_flush_cache_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
@@ -142,11 +141,11 @@ static void octeon_flush_cache_range(struct vm_area_struct *vma,
/**
- * Flush a specific page of a vma
+ * octeon_flush_cache_page - Flush a specific page of a vma
*
* @vma: VMA to flush page for
* @page: Page to flush
- * @pfn:
+ * @pfn: Page frame number
*/
static void octeon_flush_cache_page(struct vm_area_struct *vma,
unsigned long page, unsigned long pfn)
@@ -160,7 +159,7 @@ static void octeon_flush_kernel_vmap_range(unsigned long vaddr, int size)
BUG();
}
-/**
+/*
* Probe Octeon's caches
*
*/
@@ -256,7 +255,7 @@ static void octeon_cache_error_setup(void)
set_handler(0x100, &except_vec2_octeon, 0x80);
}
-/**
+/*
* Setup the Octeon cache flush routines
*
*/
@@ -341,7 +340,7 @@ asmlinkage void cache_parity_error_octeon_recoverable(void)
co_cache_error_call_notifiers(0);
}
-/**
+/*
 * Called when the exception is not recoverable
*/
diff --git a/arch/mips/mti-malta/malta-dtshim.c b/arch/mips/mti-malta/malta-dtshim.c
index 0ddf03df6268..f451268f6c38 100644
--- a/arch/mips/mti-malta/malta-dtshim.c
+++ b/arch/mips/mti-malta/malta-dtshim.c
@@ -22,7 +22,7 @@
#define ROCIT_CONFIG_GEN1_MEMMAP_SHIFT 8
#define ROCIT_CONFIG_GEN1_MEMMAP_MASK (0xf << 8)
-static unsigned char fdt_buf[16 << 10] __initdata;
+static unsigned char fdt_buf[16 << 10] __initdata __aligned(8);
/* determined physical memory size, not overridden by command line args */
extern unsigned long physical_memsize;
diff --git a/arch/mips/netlogic/xlr/fmn-config.c b/arch/mips/netlogic/xlr/fmn-config.c
index c7622c6e5f67..15483537e8cf 100644
--- a/arch/mips/netlogic/xlr/fmn-config.c
+++ b/arch/mips/netlogic/xlr/fmn-config.c
@@ -103,18 +103,19 @@ static void check_credit_distribution(void)
}
/**
- * Configure bucket size and credits for a device. 'size' is the size of
- * the buckets for the device. This size is distributed among all the CPUs
- * so that all of them can send messages to the device.
- *
- * The device is also given 'cpu_credits' to send messages to the CPUs
- *
+ * setup_fmn_cc - Configure bucket size and credits for a device.
* @dev_info: FMN information structure for each devices
* @start_stn_id: Starting station id of dev_info
* @end_stn_id: End station id of dev_info
 * @num_buckets: Total number of buckets for dev_info
* @cpu_credits: Allowed credits to cpu for each devices pointing by dev_info
* @size: Size of the each buckets in the device station
+ *
+ * 'size' is the size of the buckets for the device. This size is
+ * distributed among all the CPUs
+ * so that all of them can send messages to the device.
+ *
+ * The device is also given 'cpu_credits' to send messages to the CPUs
*/
static void setup_fmn_cc(struct xlr_fmn_info *dev_info, int start_stn_id,
int end_stn_id, int num_buckets, int cpu_credits, int size)
@@ -174,6 +175,8 @@ static void setup_cpu_fmninfo(struct xlr_fmn_info *cpu, int num_core)
}
/**
+ * xlr_board_info_setup - Setup FMN details
+ *
* Setup the FMN details for each devices according to the device available
* in each variant of XLR/XLS processor
*/
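
The comment churn in c-octeon.c and fmn-config.c above is kernel-doc hygiene: a block opened with /** must start with "function_name - summary" and document every @parameter, otherwise it should be demoted to a plain /* comment. A minimal example of the expected shape; the function and parameters are made up for illustration.

/**
 * example_flush_range - write back and invalidate a kernel address range
 * @start: first virtual address in the range
 * @end: first virtual address past the range
 *
 * A longer description follows the one-line summary and the parameter
 * list, separated from them by an empty comment line.
 */
static void example_flush_range(unsigned long start, unsigned long end)
{
	/* Body omitted; the point is the comment layout above. */
}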
diff --git a/arch/mips/pistachio/Kconfig b/arch/mips/pistachio/Kconfig
deleted file mode 100644
index 9a0e06c95184..000000000000
--- a/arch/mips/pistachio/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-config PISTACHIO_GPTIMER_CLKSRC
- bool "Enable General Purpose Timer based clocksource"
- depends on MACH_PISTACHIO
- select CLKSRC_PISTACHIO
- select MIPS_EXTERNAL_TIMER
- help
- This option enables a clocksource driver based on a Pistachio
- SoC General Purpose external timer.
-
- If you want to enable the CPUFreq, you need to enable
- this option.
-
- If you don't want to enable CPUFreq, you can leave this disabled.
diff --git a/arch/mips/pistachio/Makefile b/arch/mips/pistachio/Makefile
deleted file mode 100644
index 66f4af17fb66..000000000000
--- a/arch/mips/pistachio/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-y += init.o irq.o time.o
diff --git a/arch/mips/pistachio/Platform b/arch/mips/pistachio/Platform
deleted file mode 100644
index c59de86dbddf..000000000000
--- a/arch/mips/pistachio/Platform
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# IMG Pistachio SoC
-#
-load-$(CONFIG_MACH_PISTACHIO) += 0xffffffff80400000
-zload-$(CONFIG_MACH_PISTACHIO) += 0xffffffff81000000
-all-$(CONFIG_MACH_PISTACHIO) := uImage.gz
diff --git a/arch/mips/pistachio/init.c b/arch/mips/pistachio/init.c
deleted file mode 100644
index e0bacfc3c6b4..000000000000
--- a/arch/mips/pistachio/init.c
+++ /dev/null
@@ -1,125 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Pistachio platform setup
- *
- * Copyright (C) 2014 Google, Inc.
- * Copyright (C) 2016 Imagination Technologies
- */
-
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/kernel.h>
-#include <linux/of_address.h>
-#include <linux/of_fdt.h>
-
-#include <asm/cacheflush.h>
-#include <asm/fw/fw.h>
-#include <asm/mips-boards/generic.h>
-#include <asm/mips-cps.h>
-#include <asm/prom.h>
-#include <asm/smp-ops.h>
-#include <asm/traps.h>
-
-/*
- * Core revision register decoding
- * Bits 23 to 20: Major rev
- * Bits 15 to 8: Minor rev
- * Bits 7 to 0: Maintenance rev
- */
-#define PISTACHIO_CORE_REV_REG 0xB81483D0
-#define PISTACHIO_CORE_REV_A1 0x00100006
-#define PISTACHIO_CORE_REV_B0 0x00100106
-
-const char *get_system_type(void)
-{
- u32 core_rev;
- const char *sys_type;
-
- core_rev = __raw_readl((const void *)PISTACHIO_CORE_REV_REG);
-
- switch (core_rev) {
- case PISTACHIO_CORE_REV_B0:
- sys_type = "IMG Pistachio SoC (B0)";
- break;
-
- case PISTACHIO_CORE_REV_A1:
- sys_type = "IMG Pistachio SoC (A1)";
- break;
-
- default:
- sys_type = "IMG Pistachio SoC";
- break;
- }
-
- return sys_type;
-}
-
-void __init *plat_get_fdt(void)
-{
- if (fw_arg0 != -2)
- panic("Device-tree not present");
- return (void *)fw_arg1;
-}
-
-void __init plat_mem_setup(void)
-{
- __dt_setup_arch(plat_get_fdt());
-}
-
-#define DEFAULT_CPC_BASE_ADDR 0x1bde0000
-#define DEFAULT_CDMM_BASE_ADDR 0x1bdd0000
-
-phys_addr_t mips_cpc_default_phys_base(void)
-{
- return DEFAULT_CPC_BASE_ADDR;
-}
-
-phys_addr_t mips_cdmm_phys_base(void)
-{
- return DEFAULT_CDMM_BASE_ADDR;
-}
-
-static void __init mips_nmi_setup(void)
-{
- void *base;
-
- base = cpu_has_veic ?
- (void *)(CAC_BASE + 0xa80) :
- (void *)(CAC_BASE + 0x380);
- memcpy(base, except_vec_nmi, 0x80);
- flush_icache_range((unsigned long)base,
- (unsigned long)base + 0x80);
-}
-
-static void __init mips_ejtag_setup(void)
-{
- void *base;
- extern char except_vec_ejtag_debug[];
-
- base = cpu_has_veic ?
- (void *)(CAC_BASE + 0xa00) :
- (void *)(CAC_BASE + 0x300);
- memcpy(base, except_vec_ejtag_debug, 0x80);
- flush_icache_range((unsigned long)base,
- (unsigned long)base + 0x80);
-}
-
-void __init prom_init(void)
-{
- board_nmi_handler_setup = mips_nmi_setup;
- board_ejtag_handler_setup = mips_ejtag_setup;
-
- mips_cm_probe();
- mips_cpc_probe();
- register_cps_smp_ops();
-
- pr_info("SoC Type: %s\n", get_system_type());
-}
-
-void __init device_tree_init(void)
-{
- if (!initial_boot_params)
- return;
-
- unflatten_and_copy_device_tree();
-}
diff --git a/arch/mips/pistachio/irq.c b/arch/mips/pistachio/irq.c
deleted file mode 100644
index 437c3101ac45..000000000000
--- a/arch/mips/pistachio/irq.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Pistachio IRQ setup
- *
- * Copyright (C) 2014 Google, Inc.
- */
-
-#include <linux/init.h>
-#include <linux/irqchip.h>
-#include <linux/kernel.h>
-
-#include <asm/cpu-features.h>
-#include <asm/irq_cpu.h>
-
-void __init arch_init_irq(void)
-{
- pr_info("EIC is %s\n", cpu_has_veic ? "on" : "off");
- pr_info("VINT is %s\n", cpu_has_vint ? "on" : "off");
-
- if (!cpu_has_veic)
- mips_cpu_irq_init();
-
- irqchip_init();
-}
diff --git a/arch/mips/pistachio/time.c b/arch/mips/pistachio/time.c
deleted file mode 100644
index de64751dec40..000000000000
--- a/arch/mips/pistachio/time.c
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Pistachio clocksource/timer setup
- *
- * Copyright (C) 2014 Google, Inc.
- */
-
-#include <linux/clk.h>
-#include <linux/clocksource.h>
-#include <linux/init.h>
-#include <linux/of.h>
-#include <linux/of_clk.h>
-
-#include <asm/mips-cps.h>
-#include <asm/time.h>
-
-unsigned int get_c0_compare_int(void)
-{
- return gic_get_c0_compare_int();
-}
-
-int get_c0_perfcount_int(void)
-{
- return gic_get_c0_perfcount_int();
-}
-EXPORT_SYMBOL_GPL(get_c0_perfcount_int);
-
-int get_c0_fdc_int(void)
-{
- return gic_get_c0_fdc_int();
-}
-
-void __init plat_time_init(void)
-{
- struct device_node *np;
- struct clk *clk;
-
- of_clk_init(NULL);
- timer_probe();
-
- np = of_get_cpu_node(0, NULL);
- if (!np) {
- pr_err("Failed to get CPU node\n");
- return;
- }
-
- clk = of_clk_get(np, 0);
- if (IS_ERR(clk)) {
- pr_err("Failed to get CPU clock: %ld\n", PTR_ERR(clk));
- return;
- }
-
- mips_hpt_frequency = clk_get_rate(clk) / 2;
- clk_put(clk);
-}
diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h
index 7d6824f7c0e8..c2a222ebfa2a 100644
--- a/arch/nds32/include/asm/cacheflush.h
+++ b/arch/nds32/include/asm/cacheflush.h
@@ -36,8 +36,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
void flush_anon_page(struct vm_area_struct *vma,
struct page *page, unsigned long vaddr);
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-void flush_kernel_dcache_page(struct page *page);
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
void flush_kernel_vmap_range(void *addr, int size);
void invalidate_kernel_vmap_range(void *addr, int size);
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages)
diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c
index ad5344ef5d33..07aac65d1cab 100644
--- a/arch/nds32/mm/cacheflush.c
+++ b/arch/nds32/mm/cacheflush.c
@@ -318,15 +318,6 @@ void flush_anon_page(struct vm_area_struct *vma,
local_irq_restore(flags);
}
-void flush_kernel_dcache_page(struct page *page)
-{
- unsigned long flags;
- local_irq_save(flags);
- cpu_dcache_wbinval_page((unsigned long)page_address(page));
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
void flush_kernel_vmap_range(void *addr, int size)
{
unsigned long flags;
diff --git a/arch/openrisc/boot/dts/or1klitex.dts b/arch/openrisc/boot/dts/or1klitex.dts
index 3f9867aa3844..91c7173c50e6 100644
--- a/arch/openrisc/boot/dts/or1klitex.dts
+++ b/arch/openrisc/boot/dts/or1klitex.dts
@@ -41,10 +41,10 @@
interrupt-controller;
};
- serial0: serial@e0002000 {
+ serial0: serial@e0006800 {
device_type = "serial";
compatible = "litex,liteuart";
- reg = <0xe0002000 0x100>;
+ reg = <0xe0006800 0x100>;
};
soc_ctrl0: soc_controller@e0000000 {
@@ -52,4 +52,13 @@
reg = <0xe0000000 0xc>;
status = "okay";
};
+
+ ethernet@e0001000 {
+ compatible = "litex,liteeth";
+ reg = <0xe0001000 0x7c>,
+ <0xe0001800 0x0a>,
+ <0x80000000 0x2000>;
+ reg-names = "mac", "mdio", "buffer";
+ interrupts = <2>;
+ };
};
diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig
index 3c2c70d3d740..d695879a4d26 100644
--- a/arch/openrisc/configs/or1klitex_defconfig
+++ b/arch/openrisc/configs/or1klitex_defconfig
@@ -1,18 +1,24 @@
CONFIG_BLK_DEV_INITRD=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
-CONFIG_BUG_ON_DATA_CORRUPTION=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_DEVTMPFS=y
-CONFIG_DEVTMPFS_MOUNT=y
CONFIG_EMBEDDED=y
+CONFIG_OPENRISC_BUILTIN_DTB="or1klitex"
CONFIG_HZ_100=y
-CONFIG_INITRAMFS_SOURCE="openrisc-rootfs.cpio.gz"
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
CONFIG_OF_OVERLAY=y
-CONFIG_OPENRISC_BUILTIN_DTB="or1klitex"
-CONFIG_PANIC_ON_OOPS=y
-CONFIG_PRINTK_TIME=y
-CONFIG_LITEX_SOC_CONTROLLER=y
+CONFIG_NETDEVICES=y
+CONFIG_LITEX_LITEETH=y
CONFIG_SERIAL_LITEUART=y
CONFIG_SERIAL_LITEUART_CONSOLE=y
-CONFIG_SOFTLOCKUP_DETECTOR=y
CONFIG_TTY_PRINTK=y
+CONFIG_LITEX_SOC_CONTROLLER=y
+CONFIG_TMPFS=y
+CONFIG_PRINTK_TIME=y
+CONFIG_PANIC_ON_OOPS=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BUG_ON_DATA_CORRUPTION=y
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 4ac591c9ca33..cdd657f80bfa 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -12,7 +12,7 @@
* et al.
*/
-/* or32 pgtable.h - macros and functions to manipulate page tables
+/* or1k pgtable.h - macros and functions to manipulate page tables
*
* Based on:
* include/asm-cris/pgtable.h
@@ -29,14 +29,14 @@
/*
* The Linux memory management assumes a three-level page table setup. On
- * or32, we use that, but "fold" the mid level into the top-level page
+ * or1k, we use that, but "fold" the mid level into the top-level page
* table. Since the MMU TLB is software loaded through an interrupt, it
* supports any page table structure, so we could have used a three-level
* setup, but for the amounts of memory we normally use, a two-level is
* probably more efficient.
*
* This file contains the functions and defines necessary to modify and use
- * the or32 page table tree.
+ * the or1k page table tree.
*/
extern void paging_init(void);
diff --git a/arch/openrisc/include/asm/setup.h b/arch/openrisc/include/asm/setup.h
new file mode 100644
index 000000000000..9acbc5deda69
--- /dev/null
+++ b/arch/openrisc/include/asm/setup.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2021 Stafford Horne
+ */
+#ifndef _ASM_OR1K_SETUP_H
+#define _ASM_OR1K_SETUP_H
+
+#include <linux/init.h>
+#include <asm-generic/setup.h>
+
+#ifndef __ASSEMBLY__
+void __init or1k_early_setup(void *fdt);
+#endif
+
+#endif /* _ASM_OR1K_SETUP_H */
diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h
index 4f9d2a261455..659834ab87fa 100644
--- a/arch/openrisc/include/asm/thread_info.h
+++ b/arch/openrisc/include/asm/thread_info.h
@@ -25,7 +25,7 @@
/* THREAD_SIZE is the size of the task_struct/kernel_stack combo.
* normally, the stack is found by doing something like p + THREAD_SIZE
- * in or32, a page is 8192 bytes, which seems like a sane size
+ * in or1k, a page is 8192 bytes, which seems like a sane size
*/
#define THREAD_SIZE_ORDER 0
diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S
index 947613f61d4a..edaa775a648e 100644
--- a/arch/openrisc/kernel/entry.S
+++ b/arch/openrisc/kernel/entry.S
@@ -326,7 +326,7 @@ EXCEPTION_ENTRY(_data_page_fault_handler)
1: l.ori r6,r0,0x0 // !write access
2:
- /* call fault.c handler in or32/mm/fault.c */
+ /* call fault.c handler in openrisc/mm/fault.c */
l.jal do_page_fault
l.nop
l.j _ret_from_exception
@@ -348,7 +348,7 @@ EXCEPTION_ENTRY(_insn_page_fault_handler)
/* r4 set be EXCEPTION_HANDLE */ // effective address of fault
l.ori r6,r0,0x0 // !write access
- /* call fault.c handler in or32/mm/fault.c */
+ /* call fault.c handler in openrisc/mm/fault.c */
l.jal do_page_fault
l.nop
l.j _ret_from_exception
@@ -547,6 +547,7 @@ EXCEPTION_ENTRY(_external_irq_handler)
l.bnf 1f // ext irq enabled, all ok.
l.nop
+#ifdef CONFIG_PRINTK
l.addi r1,r1,-0x8
l.movhi r3,hi(42f)
l.ori r3,r3,lo(42f)
@@ -560,6 +561,7 @@ EXCEPTION_ENTRY(_external_irq_handler)
.string "\n\rESR interrupt bug: in _external_irq_handler (ESR %x)\n\r"
.align 4
.previous
+#endif
l.ori r4,r4,SPR_SR_IEE // fix the bug
// l.sw PT_SR(r1),r4
diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S
index af355e3f4619..15f1b38dfe03 100644
--- a/arch/openrisc/kernel/head.S
+++ b/arch/openrisc/kernel/head.S
@@ -599,7 +599,7 @@ flush_tlb:
l.jal _flush_tlb
l.nop
-/* The MMU needs to be enabled before or32_early_setup is called */
+/* The MMU needs to be enabled before or1k_early_setup is called */
enable_mmu:
/*
@@ -641,9 +641,9 @@ enable_mmu:
/* magic number mismatch, set fdt pointer to null */
l.or r25,r0,r0
_fdt_found:
- /* pass fdt pointer to or32_early_setup in r3 */
+ /* pass fdt pointer to or1k_early_setup in r3 */
l.or r3,r0,r25
- LOAD_SYMBOL_2_GPR(r24, or32_early_setup)
+ LOAD_SYMBOL_2_GPR(r24, or1k_early_setup)
l.jalr r24
l.nop
diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c
index 8ae2da6ac097..0cd04d936a7a 100644
--- a/arch/openrisc/kernel/setup.c
+++ b/arch/openrisc/kernel/setup.c
@@ -209,7 +209,8 @@ void __init setup_cpuinfo(void)
}
/**
- * or32_early_setup
+ * or1k_early_setup
+ * @fdt: pointer to the start of the device tree in memory or NULL
*
* Handles the pointer to the device tree that this kernel is to use
* for establishing the available platform devices.
@@ -217,7 +218,7 @@ void __init setup_cpuinfo(void)
* Falls back on built-in device tree in case null pointer is passed.
*/
-void __init or32_early_setup(void *fdt)
+void __init or1k_early_setup(void *fdt)
{
if (fdt)
pr_info("FDT at %p\n", fdt);
@@ -243,21 +244,6 @@ static inline unsigned long extract_value(unsigned long reg, unsigned long mask)
return mask & reg;
}
-void __init detect_unit_config(unsigned long upr, unsigned long mask,
- char *text, void (*func) (void))
-{
- if (text != NULL)
- printk("%s", text);
-
- if (upr & mask) {
- if (func != NULL)
- func();
- else
- printk("present\n");
- } else
- printk("not present\n");
-}
-
/*
* calibrate_delay
*
diff --git a/arch/openrisc/lib/Makefile b/arch/openrisc/lib/Makefile
index 79775aaa6baa..53327406b483 100644
--- a/arch/openrisc/lib/Makefile
+++ b/arch/openrisc/lib/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-# Makefile for or32 specific library files..
+# Makefile for or1k specific library files..
#
obj-y := delay.o string.o memset.o memcpy.o
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index ca97d9baab51..c730d1a51686 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -28,7 +28,7 @@ unsigned long pte_misses; /* updated by do_page_fault() */
unsigned long pte_errors; /* updated by do_page_fault() */
/* __PHX__ :: - check the vmalloc_fault in do_page_fault()
- * - also look into include/asm-or32/mmu_context.h
+ * - also look into include/asm/mmu_context.h
*/
volatile pgd_t *current_pgd[NR_CPUS];
diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c
index 2d395998f524..7ee49f5881d1 100644
--- a/arch/parisc/boot/compressed/misc.c
+++ b/arch/parisc/boot/compressed/misc.c
@@ -26,7 +26,7 @@
extern char input_data[];
extern int input_len;
/* output_len is inserted by the linker possibly at an unaligned address */
-extern __le32 output_len __aligned(1);
+extern char output_len;
extern char _text, _end;
extern char _bss, _ebss;
extern char _startcode_end;
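
Declaring output_len as a plain char, rather than dereferencing it as an __le32 the linker may have placed at an odd address, means the value has to be fetched without alignment assumptions. A hedged sketch of one safe way to read such a symbol; the helper name is this sketch's, and whether the decompressor reads it exactly like this is not shown in the hunk.

#include <asm/unaligned.h>
#include <linux/types.h>

extern char output_len;		/* inserted by the linker, possibly unaligned */

static u32 read_output_len(void)
{
	/* Reads four little-endian bytes with no alignment requirement. */
	return get_unaligned_le32(&output_len);
}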
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index 99663fc1f997..eef0096db5f8 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -36,16 +36,12 @@ void flush_cache_all_local(void);
void flush_cache_all(void);
void flush_cache_mm(struct mm_struct *mm);
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
void flush_kernel_dcache_page_addr(void *addr);
-static inline void flush_kernel_dcache_page(struct page *page)
-{
- flush_kernel_dcache_page_addr(page_address(page));
-}
#define flush_kernel_dcache_range(start,size) \
flush_kernel_dcache_range_asm((start), (start)+(size));
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
void flush_kernel_vmap_range(void *vaddr, int size);
void invalidate_kernel_vmap_range(void *vaddr, int size);
@@ -59,7 +55,7 @@ extern void flush_dcache_page(struct page *page);
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#define flush_icache_page(vma,page) do { \
- flush_kernel_dcache_page(page); \
+ flush_kernel_dcache_page_addr(page_address(page)); \
flush_kernel_icache_page(page_address(page)); \
} while (0)
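
mips, nds32 and parisc all drop ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE in this series and instead advertise ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE, letting the generic header provide no-op stubs for architectures that do not implement the vmap-range flushes. A sketch of that opt-in pattern, assuming (not quoting) the shape of the common header:

/* Assumed shape of the generic fallback, guarded by the new macro. */
#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
static inline void flush_kernel_vmap_range(void *vaddr, int size)
{
}

static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
{
}
#endif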
diff --git a/arch/parisc/include/uapi/asm/swab.h b/arch/parisc/include/uapi/asm/swab.h
deleted file mode 100644
index 35fb2d1bfbbd..000000000000
--- a/arch/parisc/include/uapi/asm/swab.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _PARISC_SWAB_H
-#define _PARISC_SWAB_H
-
-#include <asm/bitsperlong.h>
-#include <linux/types.h>
-#include <linux/compiler.h>
-
-#define __SWAB_64_THRU_32__
-
-static inline __attribute_const__ __u16 __arch_swab16(__u16 x)
-{
- __asm__("dep %0, 15, 8, %0\n\t" /* deposit 00ab -> 0bab */
- "shd %%r0, %0, 8, %0" /* shift 000000ab -> 00ba */
- : "=r" (x)
- : "0" (x));
- return x;
-}
-#define __arch_swab16 __arch_swab16
-
-static inline __attribute_const__ __u32 __arch_swab24(__u32 x)
-{
- __asm__("shd %0, %0, 8, %0\n\t" /* shift xabcxabc -> cxab */
- "dep %0, 15, 8, %0\n\t" /* deposit cxab -> cbab */
- "shd %%r0, %0, 8, %0" /* shift 0000cbab -> 0cba */
- : "=r" (x)
- : "0" (x));
- return x;
-}
-
-static inline __attribute_const__ __u32 __arch_swab32(__u32 x)
-{
- unsigned int temp;
- __asm__("shd %0, %0, 16, %1\n\t" /* shift abcdabcd -> cdab */
- "dep %1, 15, 8, %1\n\t" /* deposit cdab -> cbab */
- "shd %0, %1, 8, %0" /* shift abcdcbab -> dcba */
- : "=r" (x), "=&r" (temp)
- : "0" (x));
- return x;
-}
-#define __arch_swab32 __arch_swab32
-
-#if __BITS_PER_LONG > 32
-/*
-** From "PA-RISC 2.0 Architecture", HP Professional Books.
-** See Appendix I page 8 , "Endian Byte Swapping".
-**
-** Pretty cool algorithm: (* == zero'd bits)
-** PERMH 01234567 -> 67452301 into %0
-** HSHL 67452301 -> 7*5*3*1* into %1
-** HSHR 67452301 -> *6*4*2*0 into %0
-** OR %0 | %1 -> 76543210 into %0 (all done!)
-*/
-static inline __attribute_const__ __u64 __arch_swab64(__u64 x)
-{
- __u64 temp;
- __asm__("permh,3210 %0, %0\n\t"
- "hshl %0, 8, %1\n\t"
- "hshr,u %0, 8, %0\n\t"
- "or %1, %0, %0"
- : "=r" (x), "=&r" (temp)
- : "0" (x));
- return x;
-}
-#define __arch_swab64 __arch_swab64
-#endif /* __BITS_PER_LONG > 32 */
-
-#endif /* _PARISC_SWAB_H */
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 86a1a63563fd..39e02227e231 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -334,7 +334,7 @@ void flush_dcache_page(struct page *page)
return;
}
- flush_kernel_dcache_page(page);
+ flush_kernel_dcache_page_addr(page_address(page));
if (!mapping)
return;
@@ -375,7 +375,6 @@ EXPORT_SYMBOL(flush_dcache_page);
/* Defined in arch/parisc/kernel/pacache.S */
EXPORT_SYMBOL(flush_kernel_dcache_range_asm);
-EXPORT_SYMBOL(flush_kernel_dcache_page_asm);
EXPORT_SYMBOL(flush_data_cache_local);
EXPORT_SYMBOL(flush_kernel_icache_range_asm);
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index eaf0603ae781..0bf854b70612 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -444,3 +444,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 678a0acbad7b..8a93b509337d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -123,6 +123,7 @@ config PPC
select ARCH_HAS_COPY_MC if PPC64
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
+ select ARCH_HAS_DEBUG_WX if STRICT_KERNEL_RWX
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_DMA_MAP_DIRECT if PPC_PSERIES
select ARCH_HAS_ELF_RANDOMIZE
@@ -182,6 +183,7 @@ config PPC
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
select GENERIC_PCI_IOMAP if PCI
+ select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
select GENERIC_VDSO_TIME_NS
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 205cd77f321f..192f0ed0097f 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -365,36 +365,6 @@ config FAIL_IOMMU
If you are unsure, say N.
-config PPC_PTDUMP
- bool "Export kernel pagetable layout to userspace via debugfs"
- depends on DEBUG_KERNEL && DEBUG_FS
- help
- This option exports the state of the kernel pagetables to a
- debugfs file. This is only useful for kernel developers who are
- working in architecture specific areas of the kernel - probably
- not a good idea to enable this feature in a production kernel.
-
- If you are unsure, say N.
-
-config PPC_DEBUG_WX
- bool "Warn on W+X mappings at boot"
- depends on PPC_PTDUMP && STRICT_KERNEL_RWX
- help
- Generate a warning if any W+X mappings are found at boot.
-
- This is useful for discovering cases where the kernel is leaving
- W+X mappings after applying NX, as such mappings are a security risk.
-
- Note that even if the check fails, your kernel is possibly
- still fine, as W+X mappings are not a security hole in
- themselves, what they do is that they make the exploitation
- of other unfixed kernel bugs easier.
-
- There is no runtime or memory usage effect of this option
- once the kernel has booted up - it's a one time check.
-
- If in doubt, say "Y".
-
config PPC_FAST_ENDIAN_SWITCH
bool "Deprecated fast endian-switch syscall"
depends on DEBUG_KERNEL && PPC_BOOK3S_64
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 6505d66f1193..aa6808e70647 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -122,6 +122,7 @@ endif
LDFLAGS_vmlinux-y := -Bstatic
LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie
+LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) += -z notext
LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y)
ifdef CONFIG_PPC64
@@ -407,7 +408,8 @@ endef
PHONY += install
install:
- $(Q)$(MAKE) $(build)=$(boot) install
+ sh -x $(srctree)/$(boot)/install.sh "$(KERNELRELEASE)" vmlinux \
+ System.map "$(INSTALL_PATH)"
archclean:
$(Q)$(MAKE) $(clean)=$(boot)
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index e312ea802aa6..6900d0ac2421 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -341,7 +341,6 @@ image-$(CONFIG_TQM8541) += cuImage.tqm8541
image-$(CONFIG_TQM8548) += cuImage.tqm8548
image-$(CONFIG_TQM8555) += cuImage.tqm8555
image-$(CONFIG_TQM8560) += cuImage.tqm8560
-image-$(CONFIG_SBC8548) += cuImage.sbc8548
image-$(CONFIG_KSI8560) += cuImage.ksi8560
# Board ports in arch/powerpc/platform/86xx/Kconfig
@@ -444,16 +443,6 @@ $(obj)/zImage: $(addprefix $(obj)/, $(image-y))
$(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y))
$(Q)rm -f $@; ln $< $@
-# Only install the vmlinux
-install: $(CONFIGURE) $(addprefix $(obj)/, $(image-y))
- sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)"
-
-# Install the vmlinux and other built boot targets.
-zInstall: $(CONFIGURE) $(addprefix $(obj)/, $(image-y))
- sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" $^
-
-PHONY += install zInstall
-
# anything not in $(targets)
clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \
zImage zImage.initrd zImage.chrp zImage.coff zImage.holly \
diff --git a/arch/powerpc/boot/dts/fsl/sbc8641d.dts b/arch/powerpc/boot/dts/fsl/sbc8641d.dts
deleted file mode 100644
index 3dca10acc161..000000000000
--- a/arch/powerpc/boot/dts/fsl/sbc8641d.dts
+++ /dev/null
@@ -1,176 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SBC8641D Device Tree Source
- *
- * Copyright 2008 Wind River Systems Inc.
- *
- * Paul Gortmaker (see MAINTAINERS for contact information)
- *
- * Based largely on the mpc8641_hpcn.dts by Freescale Semiconductor Inc.
- */
-
-/include/ "mpc8641si-pre.dtsi"
-
-/ {
- model = "SBC8641D";
- compatible = "wind,sbc8641";
-
- memory {
- device_type = "memory";
- reg = <0x00000000 0x20000000>; // 512M at 0x0
- };
-
- lbc: localbus@f8005000 {
- reg = <0xf8005000 0x1000>;
-
- ranges = <0 0 0xff000000 0x01000000 // 16MB Boot flash
- 1 0 0xf0000000 0x00010000 // 64KB EEPROM
- 2 0 0xf1000000 0x00100000 // EPLD (1MB)
- 3 0 0xe0000000 0x04000000 // 64MB LB SDRAM (CS3)
- 4 0 0xe4000000 0x04000000 // 64MB LB SDRAM (CS4)
- 6 0 0xf4000000 0x00100000 // LCD display (1MB)
- 7 0 0xe8000000 0x04000000>; // 64MB OneNAND
-
- flash@0,0 {
- compatible = "cfi-flash";
- reg = <0 0 0x01000000>;
- bank-width = <2>;
- device-width = <2>;
- #address-cells = <1>;
- #size-cells = <1>;
- partition@0 {
- label = "dtb";
- reg = <0x00000000 0x00100000>;
- read-only;
- };
- partition@300000 {
- label = "kernel";
- reg = <0x00100000 0x00400000>;
- read-only;
- };
- partition@400000 {
- label = "fs";
- reg = <0x00500000 0x00a00000>;
- };
- partition@700000 {
- label = "firmware";
- reg = <0x00f00000 0x00100000>;
- read-only;
- };
- };
-
- epld@2,0 {
- compatible = "wrs,epld-localbus";
- #address-cells = <2>;
- #size-cells = <1>;
- reg = <2 0 0x100000>;
- ranges = <0 0 5 0 1 // User switches
- 1 0 5 1 1 // Board ID/Rev
- 3 0 5 3 1>; // LEDs
- };
- };
-
- soc: soc@f8000000 {
- ranges = <0x00000000 0xf8000000 0x00100000>;
-
- enet0: ethernet@24000 {
- tbi-handle = <&tbi0>;
- phy-handle = <&phy0>;
- phy-connection-type = "rgmii-id";
- };
-
- mdio@24520 {
- phy0: ethernet-phy@1f {
- reg = <0x1f>;
- };
- phy1: ethernet-phy@0 {
- reg = <0>;
- };
- phy2: ethernet-phy@1 {
- reg = <1>;
- };
- phy3: ethernet-phy@2 {
- reg = <2>;
- };
- tbi0: tbi-phy@11 {
- reg = <0x11>;
- device_type = "tbi-phy";
- };
- };
-
- enet1: ethernet@25000 {
- tbi-handle = <&tbi1>;
- phy-handle = <&phy1>;
- phy-connection-type = "rgmii-id";
- };
-
- mdio@25520 {
- tbi1: tbi-phy@11 {
- reg = <0x11>;
- device_type = "tbi-phy";
- };
- };
-
- enet2: ethernet@26000 {
- tbi-handle = <&tbi2>;
- phy-handle = <&phy2>;
- phy-connection-type = "rgmii-id";
- };
-
- mdio@26520 {
- tbi2: tbi-phy@11 {
- reg = <0x11>;
- device_type = "tbi-phy";
- };
- };
-
- enet3: ethernet@27000 {
- tbi-handle = <&tbi3>;
- phy-handle = <&phy3>;
- phy-connection-type = "rgmii-id";
- };
-
- mdio@27520 {
- tbi3: tbi-phy@11 {
- reg = <0x11>;
- device_type = "tbi-phy";
- };
- };
- };
-
- pci0: pcie@f8008000 {
- reg = <0xf8008000 0x1000>;
- ranges = <0x02000000 0x0 0x80000000 0x80000000 0x0 0x20000000
- 0x01000000 0x0 0x00000000 0xe2000000 0x0 0x00100000>;
- interrupt-map-mask = <0xff00 0 0 7>;
-
- pcie@0 {
- ranges = <0x02000000 0x0 0x80000000
- 0x02000000 0x0 0x80000000
- 0x0 0x20000000
-
- 0x01000000 0x0 0x00000000
- 0x01000000 0x0 0x00000000
- 0x0 0x00100000>;
- };
-
- };
-
- pci1: pcie@f8009000 {
- reg = <0xf8009000 0x1000>;
- ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000
- 0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>;
-
- pcie@0 {
- ranges = <0x02000000 0x0 0xa0000000
- 0x02000000 0x0 0xa0000000
- 0x0 0x20000000
-
- 0x01000000 0x0 0x00000000
- 0x01000000 0x0 0x00000000
- 0x0 0x00100000>;
- };
- };
-};
-
-/include/ "mpc8641si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/microwatt.dts b/arch/powerpc/boot/dts/microwatt.dts
index 974abbdda249..65b270a90f94 100644
--- a/arch/powerpc/boot/dts/microwatt.dts
+++ b/arch/powerpc/boot/dts/microwatt.dts
@@ -127,6 +127,18 @@
fifo-size = <16>;
interrupts = <0x10 0x1>;
};
+
+ ethernet@8020000 {
+ compatible = "litex,liteeth";
+ reg = <0x8021000 0x100
+ 0x8020800 0x100
+ 0x8030000 0x2000>;
+			reg-names = "mac", "mdio", "buffer";
+ litex,rx-slots = <2>;
+ litex,tx-slots = <2>;
+ litex,slot-size = <0x800>;
+ interrupts = <0x11 0x1>;
+ };
};
chosen {
diff --git a/arch/powerpc/boot/dts/sbc8548-altflash.dts b/arch/powerpc/boot/dts/sbc8548-altflash.dts
deleted file mode 100644
index bb7a1e712bb7..000000000000
--- a/arch/powerpc/boot/dts/sbc8548-altflash.dts
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SBC8548 Device Tree Source
- *
- * Configured for booting off the alternate (64MB SODIMM) flash.
- * Requires switching JP12 jumpers and changing SW2.8 setting.
- *
- * Copyright 2013 Wind River Systems Inc.
- *
- * Paul Gortmaker (see MAINTAINERS for contact information)
- */
-
-
-/dts-v1/;
-
-/include/ "sbc8548-pre.dtsi"
-
-/{
- localbus@e0000000 {
- #address-cells = <2>;
- #size-cells = <1>;
- compatible = "simple-bus";
- reg = <0xe0000000 0x5000>;
- interrupt-parent = <&mpic>;
-
- ranges = <0x0 0x0 0xfc000000 0x04000000 /*64MB Flash*/
- 0x3 0x0 0xf0000000 0x04000000 /*64MB SDRAM*/
- 0x4 0x0 0xf4000000 0x04000000 /*64MB SDRAM*/
- 0x5 0x0 0xf8000000 0x00b10000 /* EPLD */
- 0x6 0x0 0xef800000 0x00800000>; /*8MB Flash*/
-
- flash@0,0 {
- #address-cells = <1>;
- #size-cells = <1>;
- reg = <0x0 0x0 0x04000000>;
- compatible = "intel,JS28F128", "cfi-flash";
- bank-width = <4>;
- device-width = <1>;
- partition@0 {
- label = "space";
- /* FC000000 -> FFEFFFFF */
- reg = <0x00000000 0x03f00000>;
- };
- partition@3f00000 {
- label = "bootloader";
- /* FFF00000 -> FFFFFFFF */
- reg = <0x03f00000 0x00100000>;
- read-only;
- };
- };
-
-
- epld@5,0 {
- compatible = "wrs,epld-localbus";
- #address-cells = <2>;
- #size-cells = <1>;
- reg = <0x5 0x0 0x00b10000>;
- ranges = <
- 0x0 0x0 0x5 0x000000 0x1fff /* LED */
- 0x1 0x0 0x5 0x100000 0x1fff /* Switches */
- 0x3 0x0 0x5 0x300000 0x1fff /* HW Rev. */
- 0xb 0x0 0x5 0xb00000 0x1fff /* EEPROM */
- >;
-
- led@0,0 {
- compatible = "led";
- reg = <0x0 0x0 0x1fff>;
- };
-
- switches@1,0 {
- compatible = "switches";
- reg = <0x1 0x0 0x1fff>;
- };
-
- hw-rev@3,0 {
- compatible = "hw-rev";
- reg = <0x3 0x0 0x1fff>;
- };
-
- eeprom@b,0 {
- compatible = "eeprom";
- reg = <0xb 0 0x1fff>;
- };
-
- };
-
- alt-flash@6,0 {
- #address-cells = <1>;
- #size-cells = <1>;
- compatible = "intel,JS28F640", "cfi-flash";
- reg = <0x6 0x0 0x800000>;
- bank-width = <1>;
- device-width = <1>;
- partition@0 {
- label = "space";
- /* EF800000 -> EFF9FFFF */
- reg = <0x00000000 0x007a0000>;
- };
- partition@7a0000 {
- label = "bootloader";
- /* EFFA0000 -> EFFFFFFF */
- reg = <0x007a0000 0x00060000>;
- read-only;
- };
- };
-
-
- };
-};
-
-/include/ "sbc8548-post.dtsi"
diff --git a/arch/powerpc/boot/dts/sbc8548-post.dtsi b/arch/powerpc/boot/dts/sbc8548-post.dtsi
deleted file mode 100644
index 9d848d409408..000000000000
--- a/arch/powerpc/boot/dts/sbc8548-post.dtsi
+++ /dev/null
@@ -1,289 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SBC8548 Device Tree Source
- *
- * Copyright 2007 Wind River Systems Inc.
- *
- * Paul Gortmaker (see MAINTAINERS for contact information)
- */
-
-/{
- soc8548@e0000000 {
- #address-cells = <1>;
- #size-cells = <1>;
- device_type = "soc";
- ranges = <0x00000000 0xe0000000 0x00100000>;
- bus-frequency = <0>;
- compatible = "simple-bus";
-
- ecm-law@0 {
- compatible = "fsl,ecm-law";
- reg = <0x0 0x1000>;
- fsl,num-laws = <10>;
- };
-
- ecm@1000 {
- compatible = "fsl,mpc8548-ecm", "fsl,ecm";
- reg = <0x1000 0x1000>;
- interrupts = <17 2>;
- interrupt-parent = <&mpic>;
- };
-
- memory-controller@2000 {
- compatible = "fsl,mpc8548-memory-controller";
- reg = <0x2000 0x1000>;
- interrupt-parent = <&mpic>;
- interrupts = <0x12 0x2>;
- };
-
- L2: l2-cache-controller@20000 {
- compatible = "fsl,mpc8548-l2-cache-controller";
- reg = <0x20000 0x1000>;
- cache-line-size = <0x20>; // 32 bytes
- cache-size = <0x80000>; // L2, 512K
- interrupt-parent = <&mpic>;
- interrupts = <0x10 0x2>;
- };
-
- i2c@3000 {
- #address-cells = <1>;
- #size-cells = <0>;
- cell-index = <0>;
- compatible = "fsl-i2c";
- reg = <0x3000 0x100>;
- interrupts = <0x2b 0x2>;
- interrupt-parent = <&mpic>;
- dfsrr;
- };
-
- i2c@3100 {
- #address-cells = <1>;
- #size-cells = <0>;
- cell-index = <1>;
- compatible = "fsl-i2c";
- reg = <0x3100 0x100>;
- interrupts = <0x2b 0x2>;
- interrupt-parent = <&mpic>;
- dfsrr;
- };
-
- dma@21300 {
- #address-cells = <1>;
- #size-cells = <1>;
- compatible = "fsl,mpc8548-dma", "fsl,eloplus-dma";
- reg = <0x21300 0x4>;
- ranges = <0x0 0x21100 0x200>;
- cell-index = <0>;
- dma-channel@0 {
- compatible = "fsl,mpc8548-dma-channel",
- "fsl,eloplus-dma-channel";
- reg = <0x0 0x80>;
- cell-index = <0>;
- interrupt-parent = <&mpic>;
- interrupts = <20 2>;
- };
- dma-channel@80 {
- compatible = "fsl,mpc8548-dma-channel",
- "fsl,eloplus-dma-channel";
- reg = <0x80 0x80>;
- cell-index = <1>;
- interrupt-parent = <&mpic>;
- interrupts = <21 2>;
- };
- dma-channel@100 {
- compatible = "fsl,mpc8548-dma-channel",
- "fsl,eloplus-dma-channel";
- reg = <0x100 0x80>;
- cell-index = <2>;
- interrupt-parent = <&mpic>;
- interrupts = <22 2>;
- };
- dma-channel@180 {
- compatible = "fsl,mpc8548-dma-channel",
- "fsl,eloplus-dma-channel";
- reg = <0x180 0x80>;
- cell-index = <3>;
- interrupt-parent = <&mpic>;
- interrupts = <23 2>;
- };
- };
-
- enet0: ethernet@24000 {
- #address-cells = <1>;
- #size-cells = <1>;
- cell-index = <0>;
- device_type = "network";
- model = "eTSEC";
- compatible = "gianfar";
- reg = <0x24000 0x1000>;
- ranges = <0x0 0x24000 0x1000>;
- local-mac-address = [ 00 00 00 00 00 00 ];
- interrupts = <0x1d 0x2 0x1e 0x2 0x22 0x2>;
- interrupt-parent = <&mpic>;
- tbi-handle = <&tbi0>;
- phy-handle = <&phy0>;
-
- mdio@520 {
- #address-cells = <1>;
- #size-cells = <0>;
- compatible = "fsl,gianfar-mdio";
- reg = <0x520 0x20>;
-
- phy0: ethernet-phy@19 {
- interrupt-parent = <&mpic>;
- interrupts = <0x6 0x1>;
- reg = <0x19>;
- };
- phy1: ethernet-phy@1a {
- interrupt-parent = <&mpic>;
- interrupts = <0x7 0x1>;
- reg = <0x1a>;
- };
- tbi0: tbi-phy@11 {
- reg = <0x11>;
- device_type = "tbi-phy";
- };
- };
- };
-
- enet1: ethernet@25000 {
- #address-cells = <1>;
- #size-cells = <1>;
- cell-index = <1>;
- device_type = "network";
- model = "eTSEC";
- compatible = "gianfar";
- reg = <0x25000 0x1000>;
- ranges = <0x0 0x25000 0x1000>;
- local-mac-address = [ 00 00 00 00 00 00 ];
- interrupts = <0x23 0x2 0x24 0x2 0x28 0x2>;
- interrupt-parent = <&mpic>;
- tbi-handle = <&tbi1>;
- phy-handle = <&phy1>;
-
- mdio@520 {
- #address-cells = <1>;
- #size-cells = <0>;
- compatible = "fsl,gianfar-tbi";
- reg = <0x520 0x20>;
-
- tbi1: tbi-phy@11 {
- reg = <0x11>;
- device_type = "tbi-phy";
- };
- };
- };
-
- serial0: serial@4500 {
- cell-index = <0>;
- device_type = "serial";
- compatible = "fsl,ns16550", "ns16550";
- reg = <0x4500 0x100>; // reg base, size
- clock-frequency = <0>; // should we fill in in uboot?
- interrupts = <0x2a 0x2>;
- interrupt-parent = <&mpic>;
- };
-
- serial1: serial@4600 {
- cell-index = <1>;
- device_type = "serial";
- compatible = "fsl,ns16550", "ns16550";
- reg = <0x4600 0x100>; // reg base, size
- clock-frequency = <0>; // should we fill in in uboot?
- interrupts = <0x2a 0x2>;
- interrupt-parent = <&mpic>;
- };
-
- global-utilities@e0000 { //global utilities reg
- compatible = "fsl,mpc8548-guts";
- reg = <0xe0000 0x1000>;
- fsl,has-rstcr;
- };
-
- crypto@30000 {
- compatible = "fsl,sec2.1", "fsl,sec2.0";
- reg = <0x30000 0x10000>;
- interrupts = <45 2>;
- interrupt-parent = <&mpic>;
- fsl,num-channels = <4>;
- fsl,channel-fifo-len = <24>;
- fsl,exec-units-mask = <0xfe>;
- fsl,descriptor-types-mask = <0x12b0ebf>;
- };
-
- mpic: pic@40000 {
- interrupt-controller;
- #address-cells = <0>;
- #interrupt-cells = <2>;
- reg = <0x40000 0x40000>;
- compatible = "chrp,open-pic";
- device_type = "open-pic";
- };
- };
-
- pci0: pci@e0008000 {
- interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
- interrupt-map = <
- /* IDSEL 0x01 (PCI-X slot) @66MHz */
- 0x0800 0x0 0x0 0x1 &mpic 0x2 0x1
- 0x0800 0x0 0x0 0x2 &mpic 0x3 0x1
- 0x0800 0x0 0x0 0x3 &mpic 0x4 0x1
- 0x0800 0x0 0x0 0x4 &mpic 0x1 0x1
-
- /* IDSEL 0x11 (PCI, 3.3V 32bit) @33MHz */
- 0x8800 0x0 0x0 0x1 &mpic 0x2 0x1
- 0x8800 0x0 0x0 0x2 &mpic 0x3 0x1
- 0x8800 0x0 0x0 0x3 &mpic 0x4 0x1
- 0x8800 0x0 0x0 0x4 &mpic 0x1 0x1>;
-
- interrupt-parent = <&mpic>;
- interrupts = <0x18 0x2>;
- bus-range = <0 0>;
- ranges = <0x02000000 0x0 0x80000000 0x80000000 0x0 0x10000000
- 0x01000000 0x0 0x00000000 0xe2000000 0x0 0x00800000>;
- clock-frequency = <66000000>;
- #interrupt-cells = <1>;
- #size-cells = <2>;
- #address-cells = <3>;
- reg = <0xe0008000 0x1000>;
- compatible = "fsl,mpc8540-pcix", "fsl,mpc8540-pci";
- device_type = "pci";
- };
-
- pci1: pcie@e000a000 {
- interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
- interrupt-map = <
-
- /* IDSEL 0x0 (PEX) */
- 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1
- 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1
- 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1
- 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1>;
-
- interrupt-parent = <&mpic>;
- interrupts = <0x1a 0x2>;
- bus-range = <0x0 0xff>;
- ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x10000000
- 0x01000000 0x0 0x00000000 0xe2800000 0x0 0x08000000>;
- clock-frequency = <33000000>;
- #interrupt-cells = <1>;
- #size-cells = <2>;
- #address-cells = <3>;
- reg = <0xe000a000 0x1000>;
- compatible = "fsl,mpc8548-pcie";
- device_type = "pci";
- pcie@0 {
- reg = <0x0 0x0 0x0 0x0 0x0>;
- #size-cells = <2>;
- #address-cells = <3>;
- device_type = "pci";
- ranges = <0x02000000 0x0 0xa0000000
- 0x02000000 0x0 0xa0000000
- 0x0 0x10000000
-
- 0x01000000 0x0 0x00000000
- 0x01000000 0x0 0x00000000
- 0x0 0x00800000>;
- };
- };
-};
diff --git a/arch/powerpc/boot/dts/sbc8548-pre.dtsi b/arch/powerpc/boot/dts/sbc8548-pre.dtsi
deleted file mode 100644
index 0e3665fd15d0..000000000000
--- a/arch/powerpc/boot/dts/sbc8548-pre.dtsi
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SBC8548 Device Tree Source
- *
- * Copyright 2007 Wind River Systems Inc.
- *
- * Paul Gortmaker (see MAINTAINERS for contact information)
- */
-
-/{
- model = "SBC8548";
- compatible = "SBC8548";
- #address-cells = <1>;
- #size-cells = <1>;
-
- aliases {
- ethernet0 = &enet0;
- ethernet1 = &enet1;
- serial0 = &serial0;
- serial1 = &serial1;
- pci0 = &pci0;
- pci1 = &pci1;
- };
-
- cpus {
- #address-cells = <1>;
- #size-cells = <0>;
-
- PowerPC,8548@0 {
- device_type = "cpu";
- reg = <0>;
- d-cache-line-size = <0x20>; // 32 bytes
- i-cache-line-size = <0x20>; // 32 bytes
- d-cache-size = <0x8000>; // L1, 32K
- i-cache-size = <0x8000>; // L1, 32K
- timebase-frequency = <0>; // From uboot
- bus-frequency = <0>;
- clock-frequency = <0>;
- next-level-cache = <&L2>;
- };
- };
-
- memory {
- device_type = "memory";
- reg = <0x00000000 0x10000000>;
- };
-
-};
diff --git a/arch/powerpc/boot/dts/sbc8548.dts b/arch/powerpc/boot/dts/sbc8548.dts
deleted file mode 100644
index ce0a119f496e..000000000000
--- a/arch/powerpc/boot/dts/sbc8548.dts
+++ /dev/null
@@ -1,106 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SBC8548 Device Tree Source
- *
- * Copyright 2007 Wind River Systems Inc.
- *
- * Paul Gortmaker (see MAINTAINERS for contact information)
- */
-
-
-/dts-v1/;
-
-/include/ "sbc8548-pre.dtsi"
-
-/{
- localbus@e0000000 {
- #address-cells = <2>;
- #size-cells = <1>;
- compatible = "simple-bus";
- reg = <0xe0000000 0x5000>;
- interrupt-parent = <&mpic>;
-
- ranges = <0x0 0x0 0xff800000 0x00800000 /*8MB Flash*/
- 0x3 0x0 0xf0000000 0x04000000 /*64MB SDRAM*/
- 0x4 0x0 0xf4000000 0x04000000 /*64MB SDRAM*/
- 0x5 0x0 0xf8000000 0x00b10000 /* EPLD */
- 0x6 0x0 0xec000000 0x04000000>; /*64MB Flash*/
-
-
- flash@0,0 {
- #address-cells = <1>;
- #size-cells = <1>;
- compatible = "intel,JS28F640", "cfi-flash";
- reg = <0x0 0x0 0x800000>;
- bank-width = <1>;
- device-width = <1>;
- partition@0 {
- label = "space";
- /* FF800000 -> FFF9FFFF */
- reg = <0x00000000 0x007a0000>;
- };
- partition@7a0000 {
- label = "bootloader";
- /* FFFA0000 -> FFFFFFFF */
- reg = <0x007a0000 0x00060000>;
- read-only;
- };
- };
-
- epld@5,0 {
- compatible = "wrs,epld-localbus";
- #address-cells = <2>;
- #size-cells = <1>;
- reg = <0x5 0x0 0x00b10000>;
- ranges = <
- 0x0 0x0 0x5 0x000000 0x1fff /* LED */
- 0x1 0x0 0x5 0x100000 0x1fff /* Switches */
- 0x3 0x0 0x5 0x300000 0x1fff /* HW Rev. */
- 0xb 0x0 0x5 0xb00000 0x1fff /* EEPROM */
- >;
-
- led@0,0 {
- compatible = "led";
- reg = <0x0 0x0 0x1fff>;
- };
-
- switches@1,0 {
- compatible = "switches";
- reg = <0x1 0x0 0x1fff>;
- };
-
- hw-rev@3,0 {
- compatible = "hw-rev";
- reg = <0x3 0x0 0x1fff>;
- };
-
- eeprom@b,0 {
- compatible = "eeprom";
- reg = <0xb 0 0x1fff>;
- };
-
- };
-
- alt-flash@6,0 {
- #address-cells = <1>;
- #size-cells = <1>;
- reg = <0x6 0x0 0x04000000>;
- compatible = "intel,JS28F128", "cfi-flash";
- bank-width = <4>;
- device-width = <1>;
- partition@0 {
- label = "space";
- /* EC000000 -> EFEFFFFF */
- reg = <0x00000000 0x03f00000>;
- };
- partition@3f00000 {
- label = "bootloader";
- /* EFF00000 -> EFFFFFFF */
- reg = <0x03f00000 0x00100000>;
- read-only;
- };
- };
- };
-};
-
-/include/ "sbc8548-post.dtsi"
diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts
index aaa381da1906..e9c945b123c6 100644
--- a/arch/powerpc/boot/dts/wii.dts
+++ b/arch/powerpc/boot/dts/wii.dts
@@ -216,7 +216,18 @@
control@d800100 {
compatible = "nintendo,hollywood-control";
- reg = <0x0d800100 0x300>;
+ /*
+ * Both the address and length are wrong, according to
+ * Wiibrew this should be <0x0d800000 0x400>, but it
+ * requires refactoring the PIC1, GPIO and OTP nodes
+ * before changing that.
+ */
+ reg = <0x0d800100 0xa0>;
+ };
+
+ otp@d8001ec {
+ compatible = "nintendo,hollywood-otp";
+ reg = <0x0d8001ec 0x8>;
};
disk@d806000 {
diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh
index b6a256bc96ee..14473150ddb4 100644
--- a/arch/powerpc/boot/install.sh
+++ b/arch/powerpc/boot/install.sh
@@ -15,12 +15,25 @@
# $2 - kernel image file
# $3 - kernel map file
# $4 - default install path (blank if root directory)
-# $5 and more - kernel boot files; zImage*, uImage, cuImage.*, etc.
#
# Bail with error code if anything goes wrong
set -e
+verify () {
+ if [ ! -f "$1" ]; then
+ echo "" 1>&2
+ echo " *** Missing file: $1" 1>&2
+ echo ' *** You need to run "make" before "make install".' 1>&2
+ echo "" 1>&2
+ exit 1
+ fi
+}
+
+# Make sure the files actually exist
+verify "$2"
+verify "$3"
+
# User may have a custom install script
if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
@@ -41,15 +54,3 @@ fi
cat $2 > $4/$image_name
cp $3 $4/System.map
-
-# Copy all the bootable image files
-path=$4
-shift 4
-while [ $# -ne 0 ]; do
- image_name=`basename $1`
- if [ -f $path/$image_name ]; then
- mv $path/$image_name $path/$image_name.old
- fi
- cat $1 > $path/$image_name
- shift
-done;
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 1f4676bab786..1cd82564c996 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -298,7 +298,7 @@ cuboot*)
*-tqm8541|*-mpc8560*|*-tqm8560|*-tqm8555|*-ksi8560*)
platformo=$object/cuboot-85xx-cpm2.o
;;
- *-mpc85*|*-tqm85*|*-sbc85*)
+ *-mpc85*|*-tqm85*)
platformo=$object/cuboot-85xx.o
;;
*-amigaone)
diff --git a/arch/powerpc/configs/85xx/sbc8548_defconfig b/arch/powerpc/configs/85xx/sbc8548_defconfig
deleted file mode 100644
index 258881727119..000000000000
--- a/arch/powerpc/configs/85xx/sbc8548_defconfig
+++ /dev/null
@@ -1,50 +0,0 @@
-CONFIG_PPC_85xx=y
-CONFIG_SYSVIPC=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_EXPERT=y
-CONFIG_SLAB=y
-# CONFIG_BLK_DEV_BSG is not set
-CONFIG_SBC8548=y
-CONFIG_GEN_RTC=y
-CONFIG_BINFMT_MISC=y
-CONFIG_MATH_EMULATION=y
-# CONFIG_SECCOMP is not set
-CONFIG_PCI=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_PNP_BOOTP=y
-CONFIG_SYN_COOKIES=y
-# CONFIG_IPV6 is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_ADV_OPTIONS=y
-CONFIG_MTD_CFI_GEOMETRY=y
-CONFIG_MTD_CFI_I4=y
-CONFIG_MTD_CFI_INTELEXT=y
-CONFIG_MTD_PHYSMAP_OF=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_RAM=y
-CONFIG_NETDEVICES=y
-CONFIG_GIANFAR=y
-CONFIG_BROADCOM_PHY=y
-# CONFIG_INPUT_KEYBOARD is not set
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_SERIO is not set
-# CONFIG_VT is not set
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-# CONFIG_USB_SUPPORT is not set
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig
index a08b739123da..9465209b8c5b 100644
--- a/arch/powerpc/configs/microwatt_defconfig
+++ b/arch/powerpc/configs/microwatt_defconfig
@@ -5,6 +5,7 @@ CONFIG_PREEMPT_VOLUNTARY=y
CONFIG_TICK_CPU_ACCOUNTING=y
CONFIG_LOG_BUF_SHIFT=16
CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12
+CONFIG_CGROUPS=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_KALLSYMS_ALL=y
@@ -53,10 +54,12 @@ CONFIG_MTD_SPI_NOR=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_NETDEVICES=y
+CONFIG_LITEX_LITEETH=y
# CONFIG_WLAN is not set
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
CONFIG_SERIAL_8250=y
# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
CONFIG_SERIAL_8250_CONSOLE=y
@@ -76,8 +79,10 @@ CONFIG_SPI_SPIDEV=y
CONFIG_EXT4_FS=y
# CONFIG_FILE_LOCKING is not set
# CONFIG_DNOTIFY is not set
-# CONFIG_INOTIFY_USER is not set
+CONFIG_AUTOFS_FS=y
+CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
+CONFIG_CRYPTO_SHA256=y
# CONFIG_CRYPTO_HW is not set
# CONFIG_XZ_DEC_X86 is not set
# CONFIG_XZ_DEC_IA64 is not set
diff --git a/arch/powerpc/configs/mpc85xx_base.config b/arch/powerpc/configs/mpc85xx_base.config
index b1593fe6f70b..85907b776908 100644
--- a/arch/powerpc/configs/mpc85xx_base.config
+++ b/arch/powerpc/configs/mpc85xx_base.config
@@ -13,7 +13,6 @@ CONFIG_P1022_DS=y
CONFIG_P1022_RDK=y
CONFIG_P1023_RDB=y
CONFIG_TWR_P102x=y
-CONFIG_SBC8548=y
CONFIG_SOCRATES=y
CONFIG_STX_GP3=y
CONFIG_TQM8540=y
diff --git a/arch/powerpc/configs/mpc86xx_base.config b/arch/powerpc/configs/mpc86xx_base.config
index 67bd1fa036ee..588870e6af3b 100644
--- a/arch/powerpc/configs/mpc86xx_base.config
+++ b/arch/powerpc/configs/mpc86xx_base.config
@@ -1,6 +1,5 @@
CONFIG_PPC_86xx=y
CONFIG_MPC8641_HPCN=y
-CONFIG_SBC8641D=y
CONFIG_MPC8610_HPCD=y
CONFIG_GEF_PPC9A=y
CONFIG_GEF_SBC310=y
diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig
index d21f266cea9a..c74dc76b1d0d 100644
--- a/arch/powerpc/configs/mpc885_ads_defconfig
+++ b/arch/powerpc/configs/mpc885_ads_defconfig
@@ -1,19 +1,30 @@
-CONFIG_PPC_8xx=y
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BPF_JIT=y
+CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y
CONFIG_LOG_BUF_SHIFT=14
CONFIG_EXPERT=y
# CONFIG_ELF_CORE is not set
# CONFIG_BASE_FULL is not set
# CONFIG_FUTEX is not set
+CONFIG_PERF_EVENTS=y
# CONFIG_VM_EVENT_COUNTERS is not set
-# CONFIG_BLK_DEV_BSG is not set
-CONFIG_PARTITION_ADVANCED=y
+CONFIG_PPC_8xx=y
+CONFIG_8xx_GPIO=y
+CONFIG_SMC_UCODE_PATCH=y
+CONFIG_PIN_TLB=y
CONFIG_GEN_RTC=y
CONFIG_HZ_100=y
+CONFIG_MATH_EMULATION=y
+CONFIG_PPC_16K_PAGES=y
+CONFIG_ADVANCED_OPTIONS=y
# CONFIG_SECCOMP is not set
+CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_MODULES=y
+# CONFIG_BLK_DEV_BSG is not set
+CONFIG_PARTITION_ADVANCED=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -21,7 +32,6 @@ CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_PNP=y
CONFIG_SYN_COOKIES=y
-# CONFIG_IPV6 is not set
# CONFIG_FW_LOADER is not set
CONFIG_MTD=y
CONFIG_MTD_BLOCK=y
@@ -34,6 +44,7 @@ CONFIG_MTD_CFI_GEOMETRY=y
# CONFIG_MTD_CFI_I2 is not set
CONFIG_MTD_CFI_I4=y
CONFIG_MTD_CFI_AMDSTD=y
+CONFIG_MTD_PHYSMAP=y
CONFIG_MTD_PHYSMAP_OF=y
# CONFIG_BLK_DEV is not set
CONFIG_NETDEVICES=y
@@ -46,39 +57,25 @@ CONFIG_DAVICOM_PHY=y
# CONFIG_LEGACY_PTYS is not set
CONFIG_SERIAL_CPM=y
CONFIG_SERIAL_CPM_CONSOLE=y
+CONFIG_SPI=y
+CONFIG_SPI_FSL_SPI=y
# CONFIG_HWMON is not set
+CONFIG_WATCHDOG=y
+CONFIG_8xxx_WDT=y
# CONFIG_USB_SUPPORT is not set
# CONFIG_DNOTIFY is not set
CONFIG_TMPFS=y
CONFIG_CRAMFS=y
CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_DEV_TALITOS=y
CONFIG_CRC32_SLICEBY4=y
CONFIG_DEBUG_INFO=y
CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
-CONFIG_PPC_16K_PAGES=y
-CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_FS=y
-CONFIG_PPC_PTDUMP=y
-CONFIG_MODULES=y
-CONFIG_SPI=y
-CONFIG_SPI_FSL_SPI=y
-CONFIG_CRYPTO=y
-CONFIG_CRYPTO_DEV_TALITOS=y
-CONFIG_8xx_GPIO=y
-CONFIG_WATCHDOG=y
-CONFIG_8xxx_WDT=y
-CONFIG_SMC_UCODE_PATCH=y
-CONFIG_ADVANCED_OPTIONS=y
-CONFIG_PIN_TLB=y
-CONFIG_PERF_EVENTS=y
-CONFIG_MATH_EMULATION=y
-CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y
-CONFIG_STRICT_KERNEL_RWX=y
-CONFIG_IPV6=y
-CONFIG_BPF_JIT=y
CONFIG_DEBUG_VM_PGTABLE=y
+CONFIG_DETECT_HUNG_TASK=y
CONFIG_BDI_SWITCH=y
CONFIG_PPC_EARLY_DEBUG=y
-CONFIG_PPC_EARLY_DEBUG_CPM_ADDR=0xff002008
+CONFIG_PPC_PTDUMP=y
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index ee09f7bb6ea9..6697c5e6682f 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -55,7 +55,6 @@ CONFIG_MPC837x_RDB=y
CONFIG_ASP834x=y
CONFIG_PPC_86xx=y
CONFIG_MPC8641_HPCN=y
-CONFIG_SBC8641D=y
CONFIG_MPC8610_HPCD=y
CONFIG_GEF_SBC610=y
CONFIG_CPU_FREQ=y
diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig
index 379c171f3ddd..a0c45bf2bfb1 100644
--- a/arch/powerpc/configs/wii_defconfig
+++ b/arch/powerpc/configs/wii_defconfig
@@ -99,6 +99,7 @@ CONFIG_LEDS_TRIGGER_HEARTBEAT=y
CONFIG_LEDS_TRIGGER_PANIC=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GENERIC=y
+CONFIG_NVMEM_NINTENDO_OTP=y
CONFIG_EXT2_FS=y
CONFIG_EXT4_FS=y
CONFIG_FUSE_FS=m
diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h
index 19b70c5b5f18..2b736d9fbb1b 100644
--- a/arch/powerpc/include/asm/asm-compat.h
+++ b/arch/powerpc/include/asm/asm-compat.h
@@ -17,7 +17,7 @@
#define PPC_LONG stringify_in_c(.8byte)
#define PPC_LONG_ALIGN stringify_in_c(.balign 8)
#define PPC_TLNEI stringify_in_c(tdnei)
-#define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh)
+#define PPC_LLARX stringify_in_c(ldarx)
#define PPC_STLCX stringify_in_c(stdcx.)
#define PPC_CNTLZL stringify_in_c(cntlzd)
#define PPC_MTOCRF(FXM, RS) MTOCRF((FXM), RS)
@@ -50,7 +50,7 @@
#define PPC_LONG stringify_in_c(.long)
#define PPC_LONG_ALIGN stringify_in_c(.balign 4)
#define PPC_TLNEI stringify_in_c(twnei)
-#define PPC_LLARX(t, a, b, eh) PPC_LWARX(t, a, b, eh)
+#define PPC_LLARX stringify_in_c(lwarx)
#define PPC_STLCX stringify_in_c(stwcx.)
#define PPC_CNTLZL stringify_in_c(cntlzw)
#define PPC_MTOCRF stringify_in_c(mtcrf)
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index a1732a79e92a..6a53ef178bfd 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -207,7 +207,7 @@ arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new)
int r, o = *old;
__asm__ __volatile__ (
-"1:\t" PPC_LWARX(%0,0,%2,1) " # atomic_try_cmpxchg_acquire \n"
+"1: lwarx %0,0,%2,%5 # atomic_try_cmpxchg_acquire \n"
" cmpw 0,%0,%3 \n"
" bne- 2f \n"
" stwcx. %4,0,%2 \n"
@@ -215,7 +215,7 @@ arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new)
"\t" PPC_ACQUIRE_BARRIER " \n"
"2: \n"
: "=&r" (r), "+m" (v->counter)
- : "r" (&v->counter), "r" (o), "r" (new)
+ : "r" (&v->counter), "r" (o), "r" (new), "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
: "cr0", "memory");
if (unlikely(r != o))
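
The asm-compat.h hunk above turns PPC_LLARX into a plain ldarx/lwarx mnemonic, and the atomic.h hunk (like the bitops.h hunks just below) now hands the EH exclusive-access hint to the assembler as an "i" immediate operand, forced to 0 when CONFIG_PPC64 is not set. A minimal sketch of the resulting pattern, with a hypothetical function name and assuming the kernel's <asm/asm-compat.h> and <linux/kconfig.h>:

#include <linux/kconfig.h>	/* IS_ENABLED() */
#include <asm/asm-compat.h>	/* PPC_LLARX, PPC_STLCX */

/*
 * Hypothetical atomic "set bits and return the old value" in the style of
 * the hunks above: the EH hint is an assemble-time constant operand (%4)
 * and is forced to 0 on 32-bit, where the hint is not used.
 */
static inline unsigned long demo_fetch_or(unsigned long mask,
					  volatile unsigned long *p)
{
	unsigned long old, tmp;

	__asm__ __volatile__(
"1:"	PPC_LLARX " %0,0,%3,%4\n"	/* load and reserve */
"	or	%1,%0,%2\n"		/* set the requested bits */
	PPC_STLCX " %1,0,%3\n"		/* store conditional */
"	bne-	1b\n"			/* lost the reservation: retry */
	: "=&r" (old), "=&r" (tmp)
	: "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
	: "cc", "memory");

	return old;
}
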
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 299ab33505a6..11847b6a244e 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -70,7 +70,7 @@ static inline void fn(unsigned long mask, \
unsigned long *p = (unsigned long *)_p; \
__asm__ __volatile__ ( \
prefix \
-"1:" PPC_LLARX(%0,0,%3,0) "\n" \
+"1:" PPC_LLARX "%0,0,%3,0\n" \
stringify_in_c(op) "%0,%0,%2\n" \
PPC_STLCX "%0,0,%3\n" \
"bne- 1b\n" \
@@ -115,13 +115,13 @@ static inline unsigned long fn( \
unsigned long *p = (unsigned long *)_p; \
__asm__ __volatile__ ( \
prefix \
-"1:" PPC_LLARX(%0,0,%3,eh) "\n" \
+"1:" PPC_LLARX "%0,0,%3,%4\n" \
stringify_in_c(op) "%1,%0,%2\n" \
PPC_STLCX "%1,0,%3\n" \
"bne- 1b\n" \
postfix \
: "=&r" (old), "=&r" (t) \
- : "r" (mask), "r" (p) \
+ : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \
: "cc", "memory"); \
return (old & mask); \
}
@@ -170,7 +170,7 @@ clear_bit_unlock_return_word(int nr, volatile unsigned long *addr)
__asm__ __volatile__ (
PPC_RELEASE_BARRIER
-"1:" PPC_LLARX(%0,0,%3,0) "\n"
+"1:" PPC_LLARX "%0,0,%3,0\n"
"andc %1,%0,%2\n"
PPC_STLCX "%1,0,%3\n"
"bne- 1b\n"
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index a1cc73a88710..170339969b7c 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -90,7 +90,7 @@
/* Prevent access to userspace using any key values */
LOAD_REG_IMMEDIATE(\gpr2, AMR_KUAP_BLOCKED)
999: tdne \gpr1, \gpr2
- EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
+ EMIT_WARN_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_BOOK3S_KUAP, 67)
#endif
.endm
diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 0b2162890d8b..02c08d1492f8 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -4,6 +4,7 @@
#ifdef __KERNEL__
#include <asm/asm-compat.h>
+#include <asm/extable.h>
#ifdef CONFIG_BUG
@@ -30,6 +31,11 @@
.endm
#endif /* verbose */
+.macro EMIT_WARN_ENTRY addr,file,line,flags
+ EX_TABLE(\addr,\addr+4)
+ EMIT_BUG_ENTRY \addr,\file,\line,\flags
+.endm
+
#else /* !__ASSEMBLY__ */
/* _EMIT_BUG_ENTRY expects args %0,%1,%2,%3 to be FILE, LINE, flags and
sizeof(struct bug_entry), respectively */
@@ -58,6 +64,16 @@
"i" (sizeof(struct bug_entry)), \
##__VA_ARGS__)
+#define WARN_ENTRY(insn, flags, label, ...) \
+ asm_volatile_goto( \
+ "1: " insn "\n" \
+ EX_TABLE(1b, %l[label]) \
+ _EMIT_BUG_ENTRY \
+ : : "i" (__FILE__), "i" (__LINE__), \
+ "i" (flags), \
+ "i" (sizeof(struct bug_entry)), \
+ ##__VA_ARGS__ : : label)
+
/*
* BUG_ON() and WARN_ON() do their best to cooperate with compile-time
* optimisations. However depending on the complexity of the condition
@@ -68,7 +84,19 @@
BUG_ENTRY("twi 31, 0, 0", 0); \
unreachable(); \
} while (0)
+#define HAVE_ARCH_BUG
+#define __WARN_FLAGS(flags) do { \
+ __label__ __label_warn_on; \
+ \
+ WARN_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags), __label_warn_on); \
+ unreachable(); \
+ \
+__label_warn_on: \
+ break; \
+} while (0)
+
+#ifdef CONFIG_PPC64
#define BUG_ON(x) do { \
if (__builtin_constant_p(x)) { \
if (x) \
@@ -78,31 +106,43 @@
} \
} while (0)
-#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags))
-
#define WARN_ON(x) ({ \
- int __ret_warn_on = !!(x); \
- if (__builtin_constant_p(__ret_warn_on)) { \
- if (__ret_warn_on) \
+ bool __ret_warn_on = false; \
+ do { \
+ if (__builtin_constant_p((x))) { \
+ if (!(x)) \
+ break; \
__WARN(); \
- } else { \
- BUG_ENTRY(PPC_TLNEI " %4, 0", \
- BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \
- "r" (__ret_warn_on)); \
- } \
+ __ret_warn_on = true; \
+ } else { \
+ __label__ __label_warn_on; \
+ \
+ WARN_ENTRY(PPC_TLNEI " %4, 0", \
+ BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \
+ __label_warn_on, \
+ "r" ((__force long)(x))); \
+ break; \
+__label_warn_on: \
+ __ret_warn_on = true; \
+ } \
+ } while (0); \
unlikely(__ret_warn_on); \
})
-#define HAVE_ARCH_BUG
#define HAVE_ARCH_BUG_ON
#define HAVE_ARCH_WARN_ON
+#endif
+
#endif /* __ASSEMBLY __ */
#else
#ifdef __ASSEMBLY__
.macro EMIT_BUG_ENTRY addr,file,line,flags
.endm
+.macro EMIT_WARN_ENTRY addr,file,line,flags
+.endm
#else /* !__ASSEMBLY__ */
#define _EMIT_BUG_ENTRY
+#define _EMIT_WARN_ENTRY
#endif
#endif /* CONFIG_BUG */
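
The bug.h rework above makes warnings recoverable: the trap instruction now gets an exception-table entry (via WARN_ENTRY/EMIT_WARN_ENTRY) whose fixup target is a local label, so the program-check handler can log the warning and resume there. A stripped-down sketch of the shape, written as a function with a hypothetical name rather than the real macro, and without the bug_entry record the real macro also emits:

#include <linux/compiler.h>	/* asm_volatile_goto() */
#include <linux/types.h>
#include <asm/asm-compat.h>	/* PPC_TLNEI */
#include <asm/extable.h>	/* EX_TABLE() */

/*
 * Hypothetical, simplified warning primitive: the conditional trap carries
 * an exception-table entry whose fixup is a C label, so execution resumes
 * at that label once the handler has reported the event.  The real
 * __WARN_FLAGS / WARN_ON above additionally emit a struct bug_entry so the
 * program-check handler can tell this trap is a warning.
 */
static inline bool demo_warn_on(long cond)
{
	asm_volatile_goto(
		"1:	" PPC_TLNEI "	%0, 0\n"
		EX_TABLE(1b, %l[warned])
		: : "r" (cond) : : warned);
	return false;
warned:
	return true;
}
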
diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h
deleted file mode 100644
index 2c5c48571d75..000000000000
--- a/arch/powerpc/include/asm/debugfs.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_POWERPC_DEBUGFS_H
-#define _ASM_POWERPC_DEBUGFS_H
-
-/*
- * Copyright 2017, Michael Ellerman, IBM Corporation.
- */
-
-#include <linux/debugfs.h>
-
-extern struct dentry *powerpc_debugfs_root;
-
-#endif /* _ASM_POWERPC_DEBUGFS_H */
diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index bf2402fed3e0..4265d5e95c2c 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -111,6 +111,7 @@ int drmem_update_dt(void);
int __init
walk_drmem_lmbs_early(unsigned long node, void *data,
int (*func)(struct drmem_lmb *, const __be32 **, void *));
+void drmem_update_lmbs(struct property *prop);
#endif
static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb)
diff --git a/arch/powerpc/include/asm/extable.h b/arch/powerpc/include/asm/extable.h
index eb91b2d2935a..26ce2e5c0fa8 100644
--- a/arch/powerpc/include/asm/extable.h
+++ b/arch/powerpc/include/asm/extable.h
@@ -17,6 +17,8 @@
#define ARCH_HAS_RELATIVE_EXTABLE
+#ifndef __ASSEMBLY__
+
struct exception_table_entry {
int insn;
int fixup;
@@ -28,3 +30,15 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x)
}
#endif
+
+/*
+ * Helper macro for exception table entries
+ */
+#define EX_TABLE(_fault, _target) \
+ stringify_in_c(.section __ex_table,"a";)\
+ stringify_in_c(.balign 4;) \
+ stringify_in_c(.long (_fault) - . ;) \
+ stringify_in_c(.long (_target) - . ;) \
+ stringify_in_c(.previous)
+
+#endif
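
EX_TABLE() now lives in extable.h and is usable from both assembly and C. Since ARCH_HAS_RELATIVE_EXTABLE is set, each entry stores 32-bit self-relative offsets rather than absolute addresses; a small illustrative fragment showing how such an entry resolves back into addresses, with hypothetical helper names (the fixup arithmetic matches extable_fixup() in the hunk above):

#include <asm/extable.h>

/*
 * Each 32-bit field holds "target minus the field's own address", so the
 * table is position independent and half the size of absolute 64-bit
 * pointers.
 */
static inline unsigned long demo_ex_insn_addr(const struct exception_table_entry *x)
{
	return (unsigned long)&x->insn + x->insn;
}

static inline unsigned long demo_ex_fixup_addr(const struct exception_table_entry *x)
{
	return (unsigned long)&x->fixup + x->fixup;
}
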
diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 7604673787d6..97a3bd9ffeb9 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -44,7 +44,7 @@
#define FW_FEATURE_OPAL ASM_CONST(0x0000000010000000)
#define FW_FEATURE_SET_MODE ASM_CONST(0x0000000040000000)
#define FW_FEATURE_BEST_ENERGY ASM_CONST(0x0000000080000000)
-#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000)
+#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000)
#define FW_FEATURE_PRRN ASM_CONST(0x0000000200000000)
#define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000)
#define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000)
@@ -53,6 +53,7 @@
#define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000)
#define FW_FEATURE_STUFF_TCE ASM_CONST(0x0000008000000000)
#define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000)
+#define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000)
#ifndef __ASSEMBLY__
@@ -69,11 +70,11 @@ enum {
FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
- FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
+ FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
- FW_FEATURE_RPT_INVALIDATE,
+ FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY,
FW_FEATURE_PSERIES_ALWAYS = 0,
FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR,
FW_FEATURE_POWERNV_ALWAYS = 0,
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index deef7c94d7b6..bf3b84128525 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
*/
extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
int nid, unsigned long res_start, unsigned long res_end);
+bool iommu_table_in_use(struct iommu_table *tbl);
#define IOMMU_TABLE_GROUP_MAX_TABLES 2
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index eaf3a562bf1e..19b6942c6969 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -39,6 +39,7 @@ struct kvm_nested_guest {
pgd_t *shadow_pgtable; /* our page table for this guest */
u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */
u64 process_table; /* process table entry for this guest */
+ u64 hfscr; /* HFSCR that the L1 requested for this nested guest */
long refcnt; /* number of pointers to this struct */
struct mutex tlb_lock; /* serialize page faults and tlbies */
struct kvm_nested_guest *next;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 9f52f282b1aa..a779f7849cfb 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -811,6 +811,8 @@ struct kvm_vcpu_arch {
u32 online;
+ u64 hfscr_permitted; /* A mask of permitted HFSCR facilities */
+
/* For support of nested guests */
struct kvm_nested_guest *nested;
u32 nested_vcpu_id;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2d88944f9f34..671fbd1a765e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -664,9 +664,9 @@ extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu);
extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
- struct irq_desc *host_desc);
+ unsigned long host_irq);
extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
- struct irq_desc *host_desc);
+ unsigned long host_irq);
extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h
index 6e20bb5c74ea..de7f79157918 100644
--- a/arch/powerpc/include/asm/membarrier.h
+++ b/arch/powerpc/include/asm/membarrier.h
@@ -12,7 +12,8 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
* when switching from userspace to kernel is not needed after
* store to rq->curr.
*/
- if (likely(!(atomic_read(&next->membarrier_state) &
+ if (IS_ENABLED(CONFIG_SMP) &&
+ likely(!(atomic_read(&next->membarrier_state) &
(MEMBARRIER_STATE_PRIVATE_EXPEDITED |
MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
return;
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 27016b98ecb2..8abe8e42e045 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -324,7 +324,7 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
}
#endif /* !CONFIG_DEBUG_VM */
-static inline bool radix_enabled(void)
+static __always_inline bool radix_enabled(void)
{
return mmu_has_feature(MMU_FTR_TYPE_RADIX);
}
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 74424c14515c..90f488fa4c17 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -126,6 +126,11 @@ struct pci_controller {
#endif /* CONFIG_PPC64 */
void *private_data;
+
+ /* IRQ domain hierarchy */
+ struct irq_domain *dev_domain;
+ struct irq_domain *msi_domain;
+ struct fwnode_handle *fwnode;
};
/* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h
index c6bbe9778d3c..3c09109e708e 100644
--- a/arch/powerpc/include/asm/pmc.h
+++ b/arch/powerpc/include/asm/pmc.h
@@ -34,6 +34,13 @@ static inline void ppc_set_pmu_inuse(int inuse)
#endif
}
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static inline int ppc_get_pmu_inuse(void)
+{
+ return get_paca()->pmcregs_in_use;
+}
+#endif
+
extern void power4_enable_pmcs(void);
#else /* CONFIG_PPC64 */
diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h
index d0ee0ede5767..b3f480799352 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -33,7 +33,7 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
int pnv_cxl_get_irq_count(struct pci_dev *dev);
struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
-int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+int64_t pnv_opal_pci_msi_eoi(struct irq_data *d);
bool is_pnv_opal_msi(struct irq_chip *chip);
#ifdef CONFIG_CXL_BASE
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index bede76dd3db7..baea657bc868 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -576,8 +576,6 @@
#define PPC_DIVDE(t, a, b) stringify_in_c(.long PPC_RAW_DIVDE(t, a, b))
#define PPC_DIVDEU(t, a, b) stringify_in_c(.long PPC_RAW_DIVDEU(t, a, b))
#define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LQARX(t, a, b, eh))
-#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LDARX(t, a, b, eh))
-#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LWARX(t, a, b, eh))
#define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_RAW_STQCX(t, a, b))
#define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_RAW_MADDHD(t, a, b, c))
#define PPC_MADDHDU(t, a, b, c) stringify_in_c(.long PPC_RAW_MADDHDU(t, a, b, c))
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 116c1519728a..1c538a9a11e0 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -10,6 +10,7 @@
#include <asm/ppc-opcode.h>
#include <asm/firmware.h>
#include <asm/feature-fixups.h>
+#include <asm/extable.h>
#ifdef __ASSEMBLY__
@@ -259,7 +260,7 @@ n:
/* Be careful, this will clobber the lr register. */
#define LOAD_REG_ADDR_PIC(reg, name) \
- bl 0f; \
+ bcl 20,31,$+4; \
0: mflr reg; \
addis reg,reg,(name - 0b)@ha; \
addi reg,reg,(name - 0b)@l;
@@ -752,16 +753,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96)
#endif /* __ASSEMBLY__ */
-/*
- * Helper macro for exception table entries
- */
-#define EX_TABLE(_fault, _target) \
- stringify_in_c(.section __ex_table,"a";)\
- stringify_in_c(.balign 4;) \
- stringify_in_c(.long (_fault) - . ;) \
- stringify_in_c(.long (_target) - . ;) \
- stringify_in_c(.previous)
-
#define SOFT_MASK_TABLE(_start, _end) \
stringify_in_c(.section __soft_mask_table,"a";)\
stringify_in_c(.balign 8;) \
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 324a13351749..5c80152e8f18 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -147,8 +147,9 @@ extern int of_read_drc_info_cell(struct property **prop,
#define OV5_MSI 0x0201 /* PCIe/MSI support */
#define OV5_CMO 0x0480 /* Cooperative Memory Overcommitment */
#define OV5_XCMO 0x0440 /* Page Coalescing */
-#define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */
+#define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity */
#define OV5_PRRN 0x0540 /* Platform Resource Reassignment */
+#define OV5_FORM2_AFFINITY 0x0520 /* Form2 NUMA affinity */
#define OV5_HP_EVT 0x0604 /* Hot Plug Event support */
#define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */
#define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index 14422e851494..6e560f035614 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -22,6 +22,7 @@
#include <linux/err.h>
#include <uapi/asm/ptrace.h>
#include <asm/asm-const.h>
+#include <asm/reg.h>
#ifndef __ASSEMBLY__
struct pt_regs
@@ -43,8 +44,14 @@ struct pt_regs
unsigned long mq;
#endif
unsigned long trap;
- unsigned long dar;
- unsigned long dsisr;
+ union {
+ unsigned long dar;
+ unsigned long dear;
+ };
+ union {
+ unsigned long dsisr;
+ unsigned long esr;
+ };
unsigned long result;
};
};
@@ -197,11 +204,7 @@ static inline unsigned long frame_pointer(struct pt_regs *regs)
return 0;
}
-#ifdef __powerpc64__
-#define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1)
-#else
#define user_mode(regs) (((regs)->msr & MSR_PR) != 0)
-#endif
#define force_successful_syscall_return() \
do { \
@@ -286,6 +289,28 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
regs->gpr[3] = rc;
}
+static inline bool cpu_has_msr_ri(void)
+{
+ return !IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x);
+}
+
+static inline bool regs_is_unrecoverable(struct pt_regs *regs)
+{
+ return unlikely(cpu_has_msr_ri() && !(regs->msr & MSR_RI));
+}
+
+static inline void regs_set_recoverable(struct pt_regs *regs)
+{
+ if (cpu_has_msr_ri())
+ regs_set_return_msr(regs, regs->msr | MSR_RI);
+}
+
+static inline void regs_set_unrecoverable(struct pt_regs *regs)
+{
+ if (cpu_has_msr_ri())
+ regs_set_return_msr(regs, regs->msr & ~MSR_RI);
+}
+
#define arch_has_single_step() (1)
#define arch_has_block_step() (true)
#define ARCH_HAS_USER_SINGLE_STEP_REPORT
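
The ptrace.h changes overlay dear/esr on dar/dsisr and add cpu_has_msr_ri()/regs_is_unrecoverable() and friends, so callers no longer open-code the BookE/40x special case (those cores have no MSR[RI] bit). A short usage sketch with a hypothetical handler name, assuming kernel headers:

#include <linux/kernel.h>	/* panic() */
#include <asm/ptrace.h>		/* regs_is_unrecoverable() and friends */

/*
 * Hypothetical fault-path fragment using the new helpers instead of
 * #ifdef CONFIG_BOOKE / CONFIG_40x blocks around MSR[RI] tests.
 */
static void demo_check_recoverability(struct pt_regs *regs)
{
	if (regs_is_unrecoverable(regs))
		panic("demo: exception taken with MSR[RI] clear");

	/* Flag the saved context as unrecoverable (clears MSR_RI in the
	 * MSR that will be restored on return) ... */
	regs_set_unrecoverable(regs);
	/* ... and mark it recoverable again (sets MSR_RI). */
	regs_set_recoverable(regs);
}
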
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index be85cf156a1f..e9d27265253b 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -415,6 +415,7 @@
#define FSCR_TAR __MASK(FSCR_TAR_LG)
#define FSCR_EBB __MASK(FSCR_EBB_LG)
#define FSCR_DSCR __MASK(FSCR_DSCR_LG)
+#define FSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */
#define SPRN_HFSCR 0xbe /* HV=1 Facility Status & Control Register */
#define HFSCR_PREFIX __MASK(FSCR_PREFIX_LG)
#define HFSCR_MSGP __MASK(FSCR_MSGP_LG)
@@ -426,7 +427,7 @@
#define HFSCR_DSCR __MASK(FSCR_DSCR_LG)
#define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG)
#define HFSCR_FP __MASK(FSCR_FP_LG)
-#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */
+#define HFSCR_INTR_CAUSE FSCR_INTR_CAUSE
#define SPRN_TAR 0x32f /* Target Address Register */
#define SPRN_LPCR 0x13E /* LPAR Control Register */
#define LPCR_VPM0 ASM_CONST(0x8000000000000000)
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 324d7b298ec3..6e4af4492a14 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -38,14 +38,6 @@ extern char start_virt_trampolines[];
extern char end_virt_trampolines[];
#endif
-static inline int in_kernel_text(unsigned long addr)
-{
- if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end)
- return 1;
-
- return 0;
-}
-
static inline unsigned long kernel_toc_addr(void)
{
/* Defined by the linker, see vmlinux.lds.S */
diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h
index 552f325412cc..8985791a2ba5 100644
--- a/arch/powerpc/include/asm/simple_spinlock.h
+++ b/arch/powerpc/include/asm/simple_spinlock.h
@@ -51,7 +51,7 @@ static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
token = LOCK_TOKEN;
__asm__ __volatile__(
-"1: " PPC_LWARX(%0,0,%2,1) "\n\
+"1: lwarx %0,0,%2,1\n\
cmpwi 0,%0,0\n\
bne- 2f\n\
stwcx. %1,0,%2\n\
@@ -179,7 +179,7 @@ static inline long __arch_read_trylock(arch_rwlock_t *rw)
long tmp;
__asm__ __volatile__(
-"1: " PPC_LWARX(%0,0,%1,1) "\n"
+"1: lwarx %0,0,%1,1\n"
__DO_SIGN_EXTEND
" addic. %0,%0,1\n\
ble- 2f\n"
@@ -203,7 +203,7 @@ static inline long __arch_write_trylock(arch_rwlock_t *rw)
token = WRLOCK_TOKEN;
__asm__ __volatile__(
-"1: " PPC_LWARX(%0,0,%2,1) "\n\
+"1: lwarx %0,0,%2,1\n\
cmpwi 0,%0,0\n\
bne- 2f\n"
" stwcx. %1,0,%2\n\
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 03b3d010cbab..7ef1cd8168a0 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -33,6 +33,10 @@ extern bool coregroup_enabled;
extern int cpu_to_chip_id(int cpu);
extern int *chip_id_lookup_table;
+DECLARE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
+DECLARE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
+DECLARE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map);
+
#ifdef CONFIG_SMP
struct smp_ops_t {
@@ -141,6 +145,7 @@ extern int cpu_to_core_id(int cpu);
extern bool has_big_cores;
extern bool thread_group_shares_l2;
+extern bool thread_group_shares_l3;
#define cpu_smt_mask cpu_smt_mask
#ifdef CONFIG_SCHED_SMT
@@ -195,6 +200,7 @@ extern void __cpu_die(unsigned int cpu);
#define hard_smp_processor_id() get_hard_smp_processor_id(0)
#define smp_setup_cpu_maps()
#define thread_group_shares_l2 0
+#define thread_group_shares_l3 0
static inline void inhibit_secondary_onlining(void) {}
static inline void uninhibit_secondary_onlining(void) {}
static inline const struct cpumask *cpu_sibling_mask(int cpu)
diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h
index ba0f88f3a30d..c60ebd04b2ed 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -90,10 +90,9 @@ static inline void syscall_get_arguments(struct task_struct *task,
unsigned long val, mask = -1UL;
unsigned int n = 6;
-#ifdef CONFIG_COMPAT
- if (test_tsk_thread_flag(task, TIF_32BIT))
+ if (is_32bit_task())
mask = 0xffffffff;
-#endif
+
while (n--) {
if (n == 0)
val = regs->orig_gpr3;
@@ -116,16 +115,11 @@ static inline void syscall_set_arguments(struct task_struct *task,
static inline int syscall_get_arch(struct task_struct *task)
{
- int arch;
-
- if (IS_ENABLED(CONFIG_PPC64) && !test_tsk_thread_flag(task, TIF_32BIT))
- arch = AUDIT_ARCH_PPC64;
+ if (is_32bit_task())
+ return AUDIT_ARCH_PPC;
+ else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN))
+ return AUDIT_ARCH_PPC64LE;
else
- arch = AUDIT_ARCH_PPC;
-
-#ifdef __LITTLE_ENDIAN__
- arch |= __AUDIT_ARCH_LE;
-#endif
- return arch;
+ return AUDIT_ARCH_PPC64;
}
#endif /* _ASM_SYSCALL_H */
diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h
index 398171fdcd9f..7ee66ae5444d 100644
--- a/arch/powerpc/include/asm/syscalls.h
+++ b/arch/powerpc/include/asm/syscalls.h
@@ -6,6 +6,7 @@
#include <linux/compiler.h>
#include <linux/linkage.h>
#include <linux/types.h>
+#include <linux/compat.h>
struct rtas_args;
@@ -18,5 +19,34 @@ asmlinkage long sys_mmap2(unsigned long addr, size_t len,
asmlinkage long ppc64_personality(unsigned long personality);
asmlinkage long sys_rtas(struct rtas_args __user *uargs);
+#ifdef CONFIG_COMPAT
+unsigned long compat_sys_mmap2(unsigned long addr, size_t len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff);
+
+compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count,
+ u32 reg6, u32 pos1, u32 pos2);
+
+compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count,
+ u32 reg6, u32 pos1, u32 pos2);
+
+compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u32 count);
+
+int compat_sys_truncate64(const char __user *path, u32 reg4,
+ unsigned long len1, unsigned long len2);
+
+long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2, u32 len1, u32 len2);
+
+int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1,
+ unsigned long len2);
+
+long ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2,
+ size_t len, int advice);
+
+long compat_sys_sync_file_range2(int fd, unsigned int flags,
+ unsigned int offset1, unsigned int offset2,
+ unsigned int nbytes1, unsigned int nbytes2);
+#endif
+
#endif /* __KERNEL__ */
#endif /* __ASM_POWERPC_SYSCALLS_H */
diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
index db5fc2f2262d..0c34d2756d92 100644
--- a/arch/powerpc/include/asm/tce.h
+++ b/arch/powerpc/include/asm/tce.h
@@ -19,15 +19,7 @@
#define TCE_VB 0
#define TCE_PCI 1
-/* TCE page size is 4096 bytes (1 << 12) */
-
-#define TCE_SHIFT 12
-#define TCE_PAGE_SIZE (1 << TCE_SHIFT)
-
#define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */
-
-#define TCE_RPN_MASK 0xfffffffffful /* 40-bit RPN (4K pages) */
-#define TCE_RPN_SHIFT 12
#define TCE_VALID 0x800 /* TCE valid */
#define TCE_ALLIO 0x400 /* TCE valid for all lpars */
#define TCE_PCI_WRITE 0x2 /* write from PCI allowed */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index e4db64c0e184..36fcafb1fd6d 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
cpu_all_mask : \
cpumask_of_node(pcibus_to_node(bus)))
-extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
extern int __node_distance(int, int);
#define node_distance(a, b) __node_distance(a, b)
@@ -64,6 +64,12 @@ static inline int early_cpu_to_node(int cpu)
}
int of_drconf_to_nid_single(struct drmem_lmb *lmb);
+void update_numa_distance(struct device_node *node);
+
+extern void map_cpu_to_node(int cpu, int node);
+#ifdef CONFIG_HOTPLUG_CPU
+extern void unmap_cpu_from_node(unsigned long cpu);
+#endif /* CONFIG_HOTPLUG_CPU */
#else
@@ -83,7 +89,7 @@ static inline void sysfs_remove_device_from_node(struct device *dev,
static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
-static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
return 0;
}
@@ -93,6 +99,15 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb)
return first_online_node;
}
+static inline void update_numa_distance(struct device_node *node) {}
+
+#ifdef CONFIG_SMP
+static inline void map_cpu_to_node(int cpu, int node) {}
+#ifdef CONFIG_HOTPLUG_CPU
+static inline void unmap_cpu_from_node(unsigned long cpu) {}
+#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_SMP */
+
#endif /* CONFIG_NUMA */
#if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index b541c690a31c..5eb462af6766 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -9,8 +9,6 @@
#define NR_syscalls __NR_syscalls
-#define __NR__exit __NR_exit
-
#ifndef __ASSEMBLY__
#include <linux/types.h>
diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h
index e072577bc7c0..8d79f994b4aa 100644
--- a/arch/powerpc/include/asm/vdso/processor.h
+++ b/arch/powerpc/include/asm/vdso/processor.h
@@ -5,12 +5,21 @@
#ifndef __ASSEMBLY__
/* Macros for adjusting thread priority (hardware multi-threading) */
+#ifdef CONFIG_PPC64
#define HMT_very_low() asm volatile("or 31, 31, 31 # very low priority")
#define HMT_low() asm volatile("or 1, 1, 1 # low priority")
#define HMT_medium_low() asm volatile("or 6, 6, 6 # medium low priority")
#define HMT_medium() asm volatile("or 2, 2, 2 # medium priority")
#define HMT_medium_high() asm volatile("or 5, 5, 5 # medium high priority")
#define HMT_high() asm volatile("or 3, 3, 3 # high priority")
+#else
+#define HMT_very_low()
+#define HMT_low()
+#define HMT_medium_low()
+#define HMT_medium()
+#define HMT_medium_high()
+#define HMT_high()
+#endif
#ifdef CONFIG_PPC64
#define cpu_relax() do { HMT_low(); HMT_medium(); barrier(); } while (0)
diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index d9cf192368ad..0ac9bfddf704 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -89,10 +89,11 @@ static inline int ics_opal_init(void) { return -ENODEV; }
/* ICS instance, hooked up to chip_data of an irq */
struct ics {
struct list_head link;
- int (*map)(struct ics *ics, unsigned int virq);
+ int (*check)(struct ics *ics, unsigned int hwirq);
void (*mask_unknown)(struct ics *ics, unsigned long vec);
long (*get_server)(struct ics *ics, unsigned long vec);
int (*host_match)(struct ics *ics, struct device_node *node);
+ struct irq_chip *chip;
char data[];
};
diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h
index 8b211faa0e42..cf8bb6ac4463 100644
--- a/arch/powerpc/include/asm/xive-regs.h
+++ b/arch/powerpc/include/asm/xive-regs.h
@@ -80,10 +80,13 @@
#define TM_QW0W2_VU PPC_BIT32(0)
#define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ?
#define TM_QW1W2_VO PPC_BIT32(0)
+#define TM_QW1W2_HO PPC_BIT32(1) /* P10 XIVE2 */
#define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31)
#define TM_QW2W2_VP PPC_BIT32(0)
+#define TM_QW2W2_HP PPC_BIT32(1) /* P10 XIVE2 */
#define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31)
#define TM_QW3W2_VT PPC_BIT32(0)
+#define TM_QW3W2_HT PPC_BIT32(1) /* P10 XIVE2 */
#define TM_QW3W2_LP PPC_BIT32(6)
#define TM_QW3W2_LE PPC_BIT32(7)
#define TM_QW3W2_T PPC_BIT32(31)
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index aa094a8655b0..92930b0b5d0e 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -111,6 +111,7 @@ void xive_native_free_vp_block(u32 vp_base);
int xive_native_populate_irq_data(u32 hw_irq,
struct xive_irq_data *data);
void xive_cleanup_irq_data(struct xive_irq_data *xd);
+void xive_irq_free_data(unsigned int virq);
void xive_native_free_irq(u32 irq);
int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
@@ -125,6 +126,7 @@ int xive_native_enable_vp(u32 vp_id, bool single_escalation);
int xive_native_disable_vp(u32 vp_id);
int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
bool xive_native_has_single_escalation(void);
+bool xive_native_has_save_restore(void);
int xive_native_get_queue_info(u32 vp_id, uint32_t prio,
u64 *out_qpage,
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index f66b63e81c3b..7be36c1e1db6 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -46,7 +46,8 @@ obj-y := cputable.o syscalls.o \
prom.o traps.o setup-common.o \
udbg.o misc.o io.o misc_$(BITS).o \
of_platform.o prom_parse.o firmware.o \
- hw_breakpoint_constraints.o interrupt.o
+ hw_breakpoint_constraints.o interrupt.o \
+ kdebugfs.o
obj-y += ptrace/
obj-$(CONFIG_PPC64) += setup_64.o \
paca.o nvram_64.o note.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 5bee245d832b..e563d3222d69 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -286,23 +286,16 @@ int main(void)
STACK_PT_REGS_OFFSET(_CCR, ccr);
STACK_PT_REGS_OFFSET(_XER, xer);
STACK_PT_REGS_OFFSET(_DAR, dar);
+ STACK_PT_REGS_OFFSET(_DEAR, dear);
STACK_PT_REGS_OFFSET(_DSISR, dsisr);
+ STACK_PT_REGS_OFFSET(_ESR, esr);
STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3);
STACK_PT_REGS_OFFSET(RESULT, result);
STACK_PT_REGS_OFFSET(_TRAP, trap);
-#ifndef CONFIG_PPC64
- /*
- * The PowerPC 400-class & Book-E processors have neither the DAR
- * nor the DSISR SPRs. Hence, we overload them to hold the similar
- * DEAR and ESR SPRs for such processors. For critical interrupts
- * we use them to hold SRR0 and SRR1.
- */
- STACK_PT_REGS_OFFSET(_DEAR, dar);
- STACK_PT_REGS_OFFSET(_ESR, dsisr);
-#else /* CONFIG_PPC64 */
+#ifdef CONFIG_PPC64
STACK_PT_REGS_OFFSET(SOFTE, softe);
STACK_PT_REGS_OFFSET(_PPR, ppr);
-#endif /* CONFIG_PPC64 */
+#endif
#ifdef CONFIG_PPC_PKEY
STACK_PT_REGS_OFFSET(STACK_REGS_AMR, amr);
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 6f903e9aa20b..cf1be75b7833 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -120,6 +120,7 @@ struct cache {
struct cpumask shared_cpu_map; /* online CPUs using this cache */
int type; /* split cache disambiguation */
int level; /* level not explicit in device tree */
+ int group_id; /* id of the group of threads that share this cache */
struct list_head list; /* global list of cache objects */
struct cache *next_local; /* next cache of >= level */
};
@@ -142,22 +143,24 @@ static const char *cache_type_string(const struct cache *cache)
}
static void cache_init(struct cache *cache, int type, int level,
- struct device_node *ofnode)
+ struct device_node *ofnode, int group_id)
{
cache->type = type;
cache->level = level;
cache->ofnode = of_node_get(ofnode);
+ cache->group_id = group_id;
INIT_LIST_HEAD(&cache->list);
list_add(&cache->list, &cache_list);
}
-static struct cache *new_cache(int type, int level, struct device_node *ofnode)
+static struct cache *new_cache(int type, int level,
+ struct device_node *ofnode, int group_id)
{
struct cache *cache;
cache = kzalloc(sizeof(*cache), GFP_KERNEL);
if (cache)
- cache_init(cache, type, level, ofnode);
+ cache_init(cache, type, level, ofnode, group_id);
return cache;
}
@@ -309,20 +312,24 @@ static struct cache *cache_find_first_sibling(struct cache *cache)
return cache;
list_for_each_entry(iter, &cache_list, list)
- if (iter->ofnode == cache->ofnode && iter->next_local == cache)
+ if (iter->ofnode == cache->ofnode &&
+ iter->group_id == cache->group_id &&
+ iter->next_local == cache)
return iter;
return cache;
}
-/* return the first cache on a local list matching node */
-static struct cache *cache_lookup_by_node(const struct device_node *node)
+/* return the first cache on a local list matching node and thread-group id */
+static struct cache *cache_lookup_by_node_group(const struct device_node *node,
+ int group_id)
{
struct cache *cache = NULL;
struct cache *iter;
list_for_each_entry(iter, &cache_list, list) {
- if (iter->ofnode != node)
+ if (iter->ofnode != node ||
+ iter->group_id != group_id)
continue;
cache = cache_find_first_sibling(iter);
break;
@@ -352,14 +359,15 @@ static int cache_is_unified_d(const struct device_node *np)
CACHE_TYPE_UNIFIED_D : CACHE_TYPE_UNIFIED;
}
-static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_unified(struct device_node *node, int group_id,
+ int level)
{
pr_debug("creating L%d ucache for %pOFP\n", level, node);
- return new_cache(cache_is_unified_d(node), level, node);
+ return new_cache(cache_is_unified_d(node), level, node, group_id);
}
-static struct cache *cache_do_one_devnode_split(struct device_node *node,
+static struct cache *cache_do_one_devnode_split(struct device_node *node, int group_id,
int level)
{
struct cache *dcache, *icache;
@@ -367,8 +375,8 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node,
pr_debug("creating L%d dcache and icache for %pOFP\n", level,
node);
- dcache = new_cache(CACHE_TYPE_DATA, level, node);
- icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node);
+ dcache = new_cache(CACHE_TYPE_DATA, level, node, group_id);
+ icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node, group_id);
if (!dcache || !icache)
goto err;
@@ -382,31 +390,32 @@ err:
return NULL;
}
-static struct cache *cache_do_one_devnode(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode(struct device_node *node, int group_id, int level)
{
struct cache *cache;
if (cache_node_is_unified(node))
- cache = cache_do_one_devnode_unified(node, level);
+ cache = cache_do_one_devnode_unified(node, group_id, level);
else
- cache = cache_do_one_devnode_split(node, level);
+ cache = cache_do_one_devnode_split(node, group_id, level);
return cache;
}
static struct cache *cache_lookup_or_instantiate(struct device_node *node,
+ int group_id,
int level)
{
struct cache *cache;
- cache = cache_lookup_by_node(node);
+ cache = cache_lookup_by_node_group(node, group_id);
WARN_ONCE(cache && cache->level != level,
"cache level mismatch on lookup (got %d, expected %d)\n",
cache->level, level);
if (!cache)
- cache = cache_do_one_devnode(node, level);
+ cache = cache_do_one_devnode(node, group_id, level);
return cache;
}
@@ -443,7 +452,30 @@ static void do_subsidiary_caches_debugcheck(struct cache *cache)
of_node_get_device_type(cache->ofnode));
}
-static void do_subsidiary_caches(struct cache *cache)
+/*
+ * If sub-groups of threads in a core containing @cpu_id share the
+ * L@level-cache (information obtained via "ibm,thread-groups"
+ * device-tree property), then we identify the group by the first
+ * thread-sibling in the group. We define this to be the group-id.
+ *
+ * In the absence of any thread-group information for L@level-cache,
+ * this function returns -1.
+ */
+static int get_group_id(unsigned int cpu_id, int level)
+{
+ if (has_big_cores && level == 1)
+ return cpumask_first(per_cpu(thread_group_l1_cache_map,
+ cpu_id));
+ else if (thread_group_shares_l2 && level == 2)
+ return cpumask_first(per_cpu(thread_group_l2_cache_map,
+ cpu_id));
+ else if (thread_group_shares_l3 && level == 3)
+ return cpumask_first(per_cpu(thread_group_l3_cache_map,
+ cpu_id));
+ return -1;
+}
+
+static void do_subsidiary_caches(struct cache *cache, unsigned int cpu_id)
{
struct device_node *subcache_node;
int level = cache->level;
@@ -452,9 +484,11 @@ static void do_subsidiary_caches(struct cache *cache)
while ((subcache_node = of_find_next_cache_node(cache->ofnode))) {
struct cache *subcache;
+ int group_id;
level++;
- subcache = cache_lookup_or_instantiate(subcache_node, level);
+ group_id = get_group_id(cpu_id, level);
+ subcache = cache_lookup_or_instantiate(subcache_node, group_id, level);
of_node_put(subcache_node);
if (!subcache)
break;
@@ -468,6 +502,7 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id)
{
struct device_node *cpu_node;
struct cache *cpu_cache = NULL;
+ int group_id;
pr_debug("creating cache object(s) for CPU %i\n", cpu_id);
@@ -476,11 +511,13 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id)
if (!cpu_node)
goto out;
- cpu_cache = cache_lookup_or_instantiate(cpu_node, 1);
+ group_id = get_group_id(cpu_id, 1);
+
+ cpu_cache = cache_lookup_or_instantiate(cpu_node, group_id, 1);
if (!cpu_cache)
goto out;
- do_subsidiary_caches(cpu_cache);
+ do_subsidiary_caches(cpu_cache, cpu_id);
cache_cpu_set(cpu_cache, cpu_id);
out:
@@ -641,45 +678,6 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char *
static struct kobj_attribute cache_level_attr =
__ATTR(level, 0444, level_show, NULL);
-static unsigned int index_dir_to_cpu(struct cache_index_dir *index)
-{
- struct kobject *index_dir_kobj = &index->kobj;
- struct kobject *cache_dir_kobj = index_dir_kobj->parent;
- struct kobject *cpu_dev_kobj = cache_dir_kobj->parent;
- struct device *dev = kobj_to_dev(cpu_dev_kobj);
-
- return dev->id;
-}
-
-/*
- * On big-core systems, each core has two groups of CPUs each of which
- * has its own L1-cache. The thread-siblings which share l1-cache with
- * @cpu can be obtained via cpu_smallcore_mask().
- *
- * On some big-core systems, the L2 cache is shared only between some
- * groups of siblings. This is already parsed and encoded in
- * cpu_l2_cache_mask().
- *
- * TODO: cache_lookup_or_instantiate() needs to be made aware of the
- * "ibm,thread-groups" property so that cache->shared_cpu_map
- * reflects the correct siblings on platforms that have this
- * device-tree property. This helper function is only a stop-gap
- * solution so that we report the correct siblings to the
- * userspace via sysfs.
- */
-static const struct cpumask *get_shared_cpu_map(struct cache_index_dir *index, struct cache *cache)
-{
- if (has_big_cores) {
- int cpu = index_dir_to_cpu(index);
- if (cache->level == 1)
- return cpu_smallcore_mask(cpu);
- if (cache->level == 2 && thread_group_shares_l2)
- return cpu_l2_cache_mask(cpu);
- }
-
- return &cache->shared_cpu_map;
-}
-
static ssize_t
show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bool list)
{
@@ -690,7 +688,7 @@ show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bo
index = kobj_to_cache_index_dir(k);
cache = index->cache;
- mask = get_shared_cpu_map(index, cache);
+ mask = &cache->shared_cpu_map;
return cpumap_print_to_pagebuf(list, buf, mask);
}
@@ -848,13 +846,15 @@ static struct cache *cache_lookup_by_cpu(unsigned int cpu_id)
{
struct device_node *cpu_node;
struct cache *cache;
+ int group_id;
cpu_node = of_get_cpu_node(cpu_id, NULL);
WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id);
if (!cpu_node)
return NULL;
- cache = cache_lookup_by_node(cpu_node);
+ group_id = get_group_id(cpu_id, 1);
+ cache = cache_lookup_by_node_group(cpu_node, group_id);
of_node_put(cpu_node);
return cache;
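
The cacheinfo.c rework keys each cache object on the pair (device-tree node, thread-group id), where the group id is the first thread-sibling of the group sharing that cache level, or -1 when "ibm,thread-groups" gives no information. With the identity carried in the object itself, shared_cpu_map is correct as built and the sysfs-time get_shared_cpu_map() stop-gap removed above is no longer needed. A hedged fragment spelling out the lookup rule, assuming the struct cache defined in this file:

/*
 * Hypothetical predicate mirroring cache_lookup_by_node_group(): two
 * lookups resolve to the same cache object only if both the OF node and
 * the thread-group id match.
 */
static bool demo_same_cache_instance(const struct cache *a, const struct cache *b)
{
	return a->ofnode == b->ofnode && a->group_id == b->group_id;
}
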
diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c
index cdc2dccb987d..64e423d2fe0f 100644
--- a/arch/powerpc/kernel/dawr.c
+++ b/arch/powerpc/kernel/dawr.c
@@ -9,7 +9,6 @@
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
-#include <asm/debugfs.h>
#include <asm/machdep.h>
#include <asm/hvcall.h>
@@ -101,7 +100,7 @@ static int __init dawr_force_setup(void)
if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) {
/* Turn DAWR off by default, but allow admin to turn it on */
debugfs_create_file_unsafe("dawr_enable_dangerous", 0600,
- powerpc_debugfs_root,
+ arch_debugfs_dir,
&dawr_force_enable,
&dawr_enable_fops);
}
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3bbdcc86d01b..e9b597ed423c 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -21,9 +21,9 @@
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/of.h>
+#include <linux/debugfs.h>
#include <linux/atomic.h>
-#include <asm/debugfs.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
#include <asm/io.h>
@@ -1901,24 +1901,24 @@ static int __init eeh_init_proc(void)
proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show);
#ifdef CONFIG_DEBUG_FS
debugfs_create_file_unsafe("eeh_enable", 0600,
- powerpc_debugfs_root, NULL,
+ arch_debugfs_dir, NULL,
&eeh_enable_dbgfs_ops);
debugfs_create_u32("eeh_max_freezes", 0600,
- powerpc_debugfs_root, &eeh_max_freezes);
+ arch_debugfs_dir, &eeh_max_freezes);
debugfs_create_bool("eeh_disable_recovery", 0600,
- powerpc_debugfs_root,
+ arch_debugfs_dir,
&eeh_debugfs_no_recover);
debugfs_create_file_unsafe("eeh_dev_check", 0600,
- powerpc_debugfs_root, NULL,
+ arch_debugfs_dir, NULL,
&eeh_dev_check_fops);
debugfs_create_file_unsafe("eeh_dev_break", 0600,
- powerpc_debugfs_root, NULL,
+ arch_debugfs_dir, NULL,
&eeh_dev_break_fops);
debugfs_create_file_unsafe("eeh_force_recover", 0600,
- powerpc_debugfs_root, NULL,
+ arch_debugfs_dir, NULL,
&eeh_force_recover_fops);
debugfs_create_file_unsafe("eeh_dev_can_recover", 0600,
- powerpc_debugfs_root, NULL,
+ arch_debugfs_dir, NULL,
&eeh_dev_can_recover_fops);
eeh_cache_debugfs_init();
#endif
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index bf3270426d82..9bdaaf7fddc9 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -12,8 +12,8 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
+#include <linux/debugfs.h>
#include <asm/pci-bridge.h>
-#include <asm/debugfs.h>
#include <asm/ppc-pci.h>
@@ -283,6 +283,6 @@ DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache);
void eeh_cache_debugfs_init(void)
{
debugfs_create_file_unsafe("eeh_address_cache", 0400,
- powerpc_debugfs_root, NULL,
+ arch_debugfs_dir, NULL,
&eeh_addr_cache_fops);
}
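
The dawr.c, eeh.c and eeh_cache.c hunks all make the same mechanical switch: asm/debugfs.h and its powerpc_debugfs_root are gone, and files hang off the generic arch_debugfs_dir from linux/debugfs.h, backed by the kdebugfs.o object added to the kernel Makefile earlier in this diff. A small sketch of the resulting idiom, with a hypothetical file name:

#include <linux/debugfs.h>
#include <linux/init.h>

static u32 demo_tunable;

/* Hypothetical example: the file ends up under the arch directory in
 * debugfs (typically <debugfs>/powerpc/demo_tunable). */
static int __init demo_debugfs_init(void)
{
	debugfs_create_u32("demo_tunable", 0600, arch_debugfs_dir,
			   &demo_tunable);
	return 0;
}
device_initcall(demo_debugfs_init);
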
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 0273a1349006..61fdd53cdd9a 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -161,10 +161,10 @@ ret_from_fork:
ret_from_kernel_thread:
REST_NVGPRS(r1)
bl schedule_tail
- mtlr r14
+ mtctr r14
mr r3,r15
PPC440EP_ERR42
- blrl
+ bctrl
li r3,0
b ret_from_syscall
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 15720f8661a1..70cff7b49e17 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -309,7 +309,7 @@ _GLOBAL(enter_rtas)
*/
lbz r0,PACAIRQSOFTMASK(r13)
1: tdeqi r0,IRQS_ENABLED
- EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
+ EMIT_WARN_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
#endif
/* Hard-disable interrupts */
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 1401787b0b93..711c66b76df1 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -545,8 +545,8 @@ __end_interrupts:
PROLOG_ADDITION_2REGS)
mfspr r14,SPRN_DEAR
mfspr r15,SPRN_ESR
- std r14,_DAR(r1)
- std r15,_DSISR(r1)
+ std r14,_DEAR(r1)
+ std r15,_ESR(r1)
ld r14,PACA_EXGEN+EX_R14(r13)
ld r15,PACA_EXGEN+EX_R15(r13)
EXCEPTION_COMMON(0x300)
@@ -558,8 +558,8 @@ __end_interrupts:
PROLOG_ADDITION_2REGS)
li r15,0
mr r14,r10
- std r14,_DAR(r1)
- std r15,_DSISR(r1)
+ std r14,_DEAR(r1)
+ std r15,_ESR(r1)
ld r14,PACA_EXGEN+EX_R14(r13)
ld r15,PACA_EXGEN+EX_R15(r13)
EXCEPTION_COMMON(0x400)
@@ -575,8 +575,8 @@ __end_interrupts:
PROLOG_ADDITION_2REGS)
mfspr r14,SPRN_DEAR
mfspr r15,SPRN_ESR
- std r14,_DAR(r1)
- std r15,_DSISR(r1)
+ std r14,_DEAR(r1)
+ std r15,_ESR(r1)
ld r14,PACA_EXGEN+EX_R14(r13)
ld r15,PACA_EXGEN+EX_R15(r13)
EXCEPTION_COMMON(0x600)
@@ -587,7 +587,7 @@ __end_interrupts:
NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM,
PROLOG_ADDITION_1REG)
mfspr r14,SPRN_ESR
- std r14,_DSISR(r1)
+ std r14,_ESR(r1)
ld r14,PACA_EXGEN+EX_R14(r13)
EXCEPTION_COMMON(0x700)
addi r3,r1,STACK_FRAME_OVERHEAD
@@ -1057,8 +1057,8 @@ bad_stack_book3e:
std r11,_CCR(r1)
mfspr r10,SPRN_DEAR
mfspr r11,SPRN_ESR
- std r10,_DAR(r1)
- std r11,_DSISR(r1)
+ std r10,_DEAR(r1)
+ std r11,_ESR(r1)
std r0,GPR0(r1); /* save r0 in stackframe */ \
std r2,GPR2(r1); /* save r2 in stackframe */ \
SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \
@@ -1127,7 +1127,7 @@ found_iprot:
* r3 = MAS0_TLBSEL (for the iprot array)
* r4 = SPRN_TLBnCFG
*/
- bl invstr /* Find our address */
+ bcl 20,31,$+4 /* Find our address */
invstr: mflr r6 /* Make it accessible */
mfmsr r7
rlwinm r5,r7,27,31,31 /* extract MSR[IS] */
@@ -1196,7 +1196,7 @@ skpinv: addi r6,r6,1 /* Increment */
mfmsr r6
xori r6,r6,MSR_IS
mtspr SPRN_SRR1,r6
- bl 1f /* Find our address */
+ bcl 20,31,$+4 /* Find our address */
1: mflr r6
addi r6,r6,(2f - 1b)
mtspr SPRN_SRR0,r6
@@ -1256,7 +1256,7 @@ skpinv: addi r6,r6,1 /* Increment */
* r4 = MAS0 w/TLBSEL & ESEL for the temp mapping
*/
/* Now we branch the new virtual address mapped by this entry */
- bl 1f /* Find our address */
+ bcl 20,31,$+4 /* Find our address */
1: mflr r6
addi r6,r6,(2f - 1b)
tovirt(r6,r6)
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index b990075285f5..b7ceb041743c 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -24,8 +24,8 @@
#include <linux/slab.h>
#include <linux/cma.h>
#include <linux/hugetlb.h>
+#include <linux/debugfs.h>
-#include <asm/debugfs.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/fadump.h>
@@ -1557,7 +1557,7 @@ static void fadump_init_files(void)
return;
}
- debugfs_create_file("fadump_region", 0444, powerpc_debugfs_root, NULL,
+ debugfs_create_file("fadump_region", 0444, arch_debugfs_dir, NULL,
&fadump_region_fops);
if (fw_dump.dump_active) {
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 6010adcee16e..ba4afe3b5a9c 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -91,8 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
isync
/* enable use of FP after return */
#ifdef CONFIG_PPC32
- mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
- tovirt(r5, r5)
+ addi r5,r2,THREAD
lwz r4,THREAD_FPEXC_MODE(r5)
ori r9,r9,MSR_FP /* enable FP for current */
or r9,r9,r4
diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
index 8bccce6544b5..dedc17fac8f8 100644
--- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S
+++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* 1. Find the index of the entry we're executing in */
- bl invstr /* Find our address */
+ bcl 20,31,$+4 /* Find our address */
invstr: mflr r6 /* Make it accessible */
mfmsr r7
rlwinm r4,r7,27,31,31 /* extract MSR[IS] */
@@ -85,7 +85,7 @@ skpinv: addi r6,r6,1 /* Increment */
addi r6,r6,10
slw r6,r8,r6 /* convert to mask */
- bl 1f /* Find our address */
+ bcl 20,31,$+4 /* Find our address */
1: mflr r7
mfspr r8,SPRN_MAS3
@@ -117,7 +117,7 @@ skpinv: addi r6,r6,1 /* Increment */
xori r6,r4,1
slwi r6,r6,5 /* setup new context with other address space */
- bl 1f /* Find our address */
+ bcl 20,31,$+4 /* Find our address */
1: mflr r9
rlwimi r7,r9,0,20,31
addi r7,r7,(2f - 1b)
@@ -207,7 +207,7 @@ next_tlb_setup:
lis r7,MSR_KERNEL@h
ori r7,r7,MSR_KERNEL@l
- bl 1f /* Find our address */
+ bcl 20,31,$+4 /* Find our address */
1: mflr r9
rlwimi r6,r9,0,20,31
addi r6,r6,(2f - 1b)
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index ddc978a2d381..02d2928d1e01 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -70,7 +70,7 @@ _ENTRY(_start);
* address.
* r21 will be loaded with the physical runtime address of _stext
*/
- bl 0f /* Get our runtime address */
+ bcl 20,31,$+4 /* Get our runtime address */
0: mflr r21 /* Make it accessible */
addis r21,r21,(_stext - 0b)@ha
addi r21,r21,(_stext - 0b)@l /* Get our current runtime base */
@@ -853,7 +853,7 @@ _GLOBAL(init_cpu_state)
wmmucr: mtspr SPRN_MMUCR,r3 /* Put MMUCR */
sync
- bl invstr /* Find our address */
+ bcl 20,31,$+4 /* Find our address */
invstr: mflr r5 /* Make it accessible */
tlbsx r23,0,r5 /* Find entry we are in */
li r4,0 /* Start at TLB entry 0 */
@@ -1045,7 +1045,7 @@ head_start_47x:
sync
/* Find the entry we are running from */
- bl 1f
+ bcl 20,31,$+4
1: mflr r23
tlbsx r23,0,r23
tlbre r24,r23,0
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 79930b0bc781..f17ae2083733 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -712,6 +712,8 @@ _GLOBAL(copy_and_flush)
isync
blr
+_ASM_NOKPROBE_SYMBOL(copy_and_flush); /* Called in real mode */
+
.align 8
copy_to_here:
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 9a2f4265e6d2..0a9a0f301474 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -79,7 +79,7 @@ _ENTRY(_start);
mr r23,r3
mr r25,r4
- bl 0f
+ bcl 20,31,$+4
0: mflr r8
addis r3,r8,(is_second_reloc - 0b)@ha
lwz r19,(is_second_reloc - 0b)@l(r3)
@@ -1132,7 +1132,7 @@ _GLOBAL(switch_to_as1)
bne 1b
/* Get the tlb entry used by the current running code */
- bl 0f
+ bcl 20,31,$+4
0: mflr r4
tlbsx 0,r4
@@ -1166,7 +1166,7 @@ _GLOBAL(switch_to_as1)
_GLOBAL(restore_to_as0)
mflr r0
- bl 0f
+ bcl 20,31,$+4
0: mflr r9
addi r9,r9,1f - 0b
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 21a638aff72f..91a3be14808b 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -22,7 +22,6 @@
#include <asm/processor.h>
#include <asm/sstep.h>
#include <asm/debug.h>
-#include <asm/debugfs.h>
#include <asm/hvcall.h>
#include <asm/inst.h>
#include <linux/uaccess.h>
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 21bbd615ca41..a73f3f70a657 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -8,7 +8,6 @@
#include <asm/asm-prototypes.h>
#include <asm/kup.h>
#include <asm/cputime.h>
-#include <asm/interrupt.h>
#include <asm/hw_irq.h>
#include <asm/interrupt.h>
#include <asm/kprobes.h>
@@ -93,8 +92,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
CT_WARN_ON(ct_state() == CONTEXT_KERNEL);
user_exit_irqoff();
- if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
- BUG_ON(!(regs->msr & MSR_RI));
+ BUG_ON(regs_is_unrecoverable(regs));
BUG_ON(!(regs->msr & MSR_PR));
BUG_ON(arch_irq_disabled_regs(regs));
@@ -463,9 +461,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs)
{
unsigned long ret;
- if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
- BUG_ON(!(regs->msr & MSR_RI));
- BUG_ON(!(regs->msr & MSR_PR));
+ BUG_ON(regs_is_unrecoverable(regs));
BUG_ON(arch_irq_disabled_regs(regs));
CT_WARN_ON(ct_state() == CONTEXT_USER);
@@ -496,10 +492,8 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
bool stack_store = current_thread_info()->flags &
_TIF_EMULATE_STACK_STORE;
- if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) &&
- unlikely(!(regs->msr & MSR_RI)))
+ if (regs_is_unrecoverable(regs))
unrecoverable_exception(regs);
- BUG_ON(regs->msr & MSR_PR);
/*
* CT_WARN_ON comes here via program_check_exception,
* so avoid recursion.
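The interrupt.c hunks above fold the open-coded MSR_RI checks, which were skipped on BookE and 40x, into a single regs_is_unrecoverable() predicate. Based only on the behaviour visible in this diff, a helper with equivalent semantics could look like the sketch below; it is an illustration of that equivalence, not the header definition added by the series.

#include <linux/compiler.h>
#include <linux/kconfig.h>
#include <asm/ptrace.h>
#include <asm/reg.h>

/*
 * Illustrative sketch only. BookE and 40x have no usable MSR[RI] bit, so
 * state is never reported as unrecoverable there; elsewhere a cleared
 * MSR[RI] means the interrupted context cannot safely take another
 * exception. Named *_sketch to avoid confusion with the real helper.
 */
static inline bool regs_is_unrecoverable_sketch(struct pt_regs *regs)
{
        if (IS_ENABLED(CONFIG_BOOKE) || IS_ENABLED(CONFIG_40x))
                return false;

        return unlikely(!(regs->msr & MSR_RI));
}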
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 30b7736f0896..07093b7cdcb9 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -688,32 +688,24 @@ static void iommu_table_reserve_pages(struct iommu_table *tbl,
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
- tbl->it_reserved_start = res_start;
- tbl->it_reserved_end = res_end;
-
- /* Check if res_start..res_end isn't empty and overlaps the table */
- if (res_start && res_end &&
- (tbl->it_offset + tbl->it_size < res_start ||
- res_end < tbl->it_offset))
- return;
+ if (res_start < tbl->it_offset)
+ res_start = tbl->it_offset;
- for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
- set_bit(i - tbl->it_offset, tbl->it_map);
-}
+ if (res_end > (tbl->it_offset + tbl->it_size))
+ res_end = tbl->it_offset + tbl->it_size;
-static void iommu_table_release_pages(struct iommu_table *tbl)
-{
- int i;
+ /* Check if res_start..res_end is a valid range in the table */
+ if (res_start >= res_end) {
+ tbl->it_reserved_start = tbl->it_offset;
+ tbl->it_reserved_end = tbl->it_offset;
+ return;
+ }
- /*
- * In case we have reserved the first bit, we should not emit
- * the warning below.
- */
- if (tbl->it_offset == 0)
- clear_bit(0, tbl->it_map);
+ tbl->it_reserved_start = res_start;
+ tbl->it_reserved_end = res_end;
for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
- clear_bit(i - tbl->it_offset, tbl->it_map);
+ set_bit(i - tbl->it_offset, tbl->it_map);
}
/*
@@ -777,6 +769,22 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
return tbl;
}
+bool iommu_table_in_use(struct iommu_table *tbl)
+{
+ unsigned long start = 0, end;
+
+ /* ignore reserved bit0 */
+ if (tbl->it_offset == 0)
+ start = 1;
+ end = tbl->it_reserved_start - tbl->it_offset;
+ if (find_next_bit(tbl->it_map, end, start) != end)
+ return true;
+
+ start = tbl->it_reserved_end - tbl->it_offset;
+ end = tbl->it_size;
+ return find_next_bit(tbl->it_map, end, start) != end;
+}
+
static void iommu_table_free(struct kref *kref)
{
struct iommu_table *tbl;
@@ -793,10 +801,8 @@ static void iommu_table_free(struct kref *kref)
iommu_debugfs_del(tbl);
- iommu_table_release_pages(tbl);
-
/* verify that table contains no entries */
- if (!bitmap_empty(tbl->it_map, tbl->it_size))
+ if (iommu_table_in_use(tbl))
pr_warn("%s: Unexpected TCEs\n", __func__);
/* free bitmap */
@@ -1097,14 +1103,9 @@ int iommu_take_ownership(struct iommu_table *tbl)
for (i = 0; i < tbl->nr_pools; i++)
spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
- iommu_table_release_pages(tbl);
-
- if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+ if (iommu_table_in_use(tbl)) {
pr_err("iommu_tce: it_map is not empty");
ret = -EBUSY;
- /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
- iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
- tbl->it_reserved_end);
} else {
memset(tbl->it_map, 0xff, sz);
}
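The reworked iommu_table_reserve_pages() clamps the requested reserved window to the table and records an empty window when the range misses the table entirely, and the new iommu_table_in_use() scans the bitmap on both sides of that window instead of temporarily clearing the reserved bits. The standalone sketch below (plain user-space C, with a made-up toy table and a trivial stand-in for find_next_bit) illustrates the two-range scan.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, simplified table: a 64-entry map plus a reserved window. */
struct toy_table {
        unsigned long offset;          /* first page number covered */
        unsigned long size;            /* number of entries */
        unsigned long reserved_start;  /* window, already clamped to the table */
        unsigned long reserved_end;
        unsigned char map[64];         /* 1 = entry in use */
};

/* Stand-in for find_next_bit(): first set index in [start, end), or end. */
static unsigned long next_set(const unsigned char *map,
                              unsigned long end, unsigned long start)
{
        for (; start < end; start++)
                if (map[start])
                        return start;
        return end;     /* mirrors find_next_bit(): 'end' means none found */
}

/* Same shape as the new iommu_table_in_use(): skip bit 0 when the table
 * starts at offset 0, then look for set bits below and above the window. */
static bool table_in_use(const struct toy_table *tbl)
{
        unsigned long start = 0, end;

        if (tbl->offset == 0)
                start = 1;
        end = tbl->reserved_start - tbl->offset;
        if (next_set(tbl->map, end, start) != end)
                return true;

        start = tbl->reserved_end - tbl->offset;
        end = tbl->size;
        return next_set(tbl->map, end, start) != end;
}

int main(void)
{
        struct toy_table tbl = {
                .offset = 0, .size = 64,
                .reserved_start = 16, .reserved_end = 32,
        };

        tbl.map[20] = 1;        /* inside the reserved window: ignored */
        printf("in use: %d\n", table_in_use(&tbl));     /* prints 0 */

        tbl.map[40] = 1;        /* outside the window: counted */
        printf("in use: %d\n", table_in_use(&tbl));     /* prints 1 */
        return 0;
}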
diff --git a/arch/powerpc/kernel/kdebugfs.c b/arch/powerpc/kernel/kdebugfs.c
new file mode 100644
index 000000000000..36d3124d5a8b
--- /dev/null
+++ b/arch/powerpc/kernel/kdebugfs.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/init.h>
+
+struct dentry *arch_debugfs_dir;
+EXPORT_SYMBOL(arch_debugfs_dir);
+
+static int __init arch_kdebugfs_init(void)
+{
+ arch_debugfs_dir = debugfs_create_dir("powerpc", NULL);
+ return 0;
+}
+arch_initcall(arch_kdebugfs_init);
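The new kdebugfs.c provides the generic arch_debugfs_dir ("powerpc" under the debugfs root) that the later hunks switch to, replacing the powerpc_debugfs_root previously set up in setup-common.c. As a rough illustration of how a consumer uses it (not part of this patch, and with a hypothetical file name), only <linux/debugfs.h> and the exported dentry are needed:

/* Hypothetical consumer, for illustration only. */
#include <linux/debugfs.h>
#include <linux/init.h>

static u32 example_value;

static int __init example_debugfs_init(void)
{
        /* Creates /sys/kernel/debug/powerpc/example_value. */
        debugfs_create_u32("example_value", 0600, arch_debugfs_dir,
                           &example_value);
        return 0;
}
late_initcall(example_debugfs_init);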
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 5be96feccb55..fb7de3543c03 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -29,7 +29,7 @@ _GLOBAL(reloc_offset)
li r3, 0
_GLOBAL(add_reloc_offset)
mflr r0
- bl 1f
+ bcl 20,31,$+4
1: mflr r5
PPC_LL r4,(2f-1b)(r5)
subf r5,r4,r5
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 39ab15419592..e5127b19fec2 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -67,7 +67,7 @@ _GLOBAL(reloc_got2)
srwi. r8,r8,2
beqlr
mtctr r8
- bl 1f
+ bcl 20,31,$+4
1: mflr r0
lis r4,1b@ha
addi r4,r4,1b@l
@@ -237,7 +237,7 @@ _GLOBAL(copy_page)
addi r3,r3,-4
0: twnei r5, 0 /* WARN if r3 is not cache aligned */
- EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
+ EMIT_WARN_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
addi r4,r4,-4
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 4b761a18a74d..d38a019b38e1 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -255,7 +255,7 @@ _GLOBAL(scom970_write)
* Physical (hardware) cpu id should be in r3.
*/
_GLOBAL(kexec_wait)
- bl 1f
+ bcl 20,31,$+4
1: mflr r5
addi r5,r5,kexec_flag-1b
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 001e90cd8948..c3573430919d 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -29,6 +29,7 @@
#include <linux/slab.h>
#include <linux/vgaarb.h>
#include <linux/numa.h>
+#include <linux/msi.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -1060,11 +1061,16 @@ void pcibios_bus_add_device(struct pci_dev *dev)
int pcibios_add_device(struct pci_dev *dev)
{
+ struct irq_domain *d;
+
#ifdef CONFIG_PCI_IOV
if (ppc_md.pcibios_fixup_sriov)
ppc_md.pcibios_fixup_sriov(dev);
#endif /* CONFIG_PCI_IOV */
+ d = dev_get_msi_domain(&dev->bus->dev);
+ if (d)
+ dev_set_msi_domain(&dev->dev, d);
return 0;
}
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 185beb290580..50436b52c213 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs)
trap == INTERRUPT_DATA_STORAGE ||
trap == INTERRUPT_ALIGNMENT) {
if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE))
- pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr);
+ pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, regs->esr);
else
pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
}
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f620e04dc9bf..44b2cdc0aae3 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -640,7 +640,9 @@ static void __init early_reserve_mem(void)
}
#endif /* CONFIG_BLK_DEV_INITRD */
-#ifdef CONFIG_PPC32
+ if (!IS_ENABLED(CONFIG_PPC32))
+ return;
+
/*
* Handle the case where we might be booting from an old kexec
* image that setup the mem_rsvmap as pairs of 32-bit values
@@ -661,7 +663,6 @@ static void __init early_reserve_mem(void)
}
return;
}
-#endif
}
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
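The early_reserve_mem() change converts the PPC32-only tail from an #ifdef block into an early return guarded by IS_ENABLED(CONFIG_PPC32), so the legacy 32-bit reserve-map handling is still parsed and type-checked on 64-bit builds and then discarded by the optimiser. A minimal, stand-alone illustration of that pattern (hypothetical function, not from this file) is:

#include <linux/kconfig.h>
#include <linux/printk.h>

static void early_reserve_example(void)
{
        /* work common to 32-bit and 64-bit goes here */

        if (!IS_ENABLED(CONFIG_PPC32))
                return;

        /*
         * 32-bit-only tail: unlike an #ifdef block, this is still seen by
         * the compiler on 64-bit builds, then eliminated as dead code.
         */
        pr_debug("32-bit legacy reserve map handling\n");
}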
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index a5bf355ce1d6..95a42d49e291 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1096,7 +1096,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = {
#else
0,
#endif
- .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
+ .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) |
+ OV5_FEAT(OV5_FORM2_AFFINITY),
.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
.micro_checkpoint = 0,
.reserved0 = 0,
diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c
index 0a0a33eb0d28..7c7093c17c45 100644
--- a/arch/powerpc/kernel/ptrace/ptrace.c
+++ b/arch/powerpc/kernel/ptrace/ptrace.c
@@ -373,8 +373,12 @@ void __init pt_regs_check(void)
offsetof(struct user_pt_regs, trap));
BUILD_BUG_ON(offsetof(struct pt_regs, dar) !=
offsetof(struct user_pt_regs, dar));
+ BUILD_BUG_ON(offsetof(struct pt_regs, dear) !=
+ offsetof(struct user_pt_regs, dar));
BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) !=
offsetof(struct user_pt_regs, dsisr));
+ BUILD_BUG_ON(offsetof(struct pt_regs, esr) !=
+ offsetof(struct user_pt_regs, dsisr));
BUILD_BUG_ON(offsetof(struct pt_regs, result) !=
offsetof(struct user_pt_regs, result));
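The extra BUILD_BUG_ON()s pin the new dear and esr names to the same offsets as dar and dsisr in user_pt_regs, which is what the DEAR/ESR printout in __show_regs() above relies on. Presumably the register structures gain anonymous unions so both spellings address the same storage; a minimal user-space sketch of that aliasing pattern (toy struct, made-up value) is:

#include <assert.h>
#include <stddef.h>

/* Sketch only: two names for the same slots, via anonymous unions. */
struct toy_regs {
        unsigned long nip;
        union {
                unsigned long dar;      /* Book3S name */
                unsigned long dear;     /* BookE/4xx name */
        };
        union {
                unsigned long dsisr;
                unsigned long esr;
        };
};

/* The aliased names must not change the layout. */
_Static_assert(offsetof(struct toy_regs, dar) ==
               offsetof(struct toy_regs, dear), "dar/dear alias");
_Static_assert(offsetof(struct toy_regs, dsisr) ==
               offsetof(struct toy_regs, esr), "dsisr/esr alias");

int main(void)
{
        struct toy_regs regs = { 0 };

        regs.esr = 0x42;
        assert(regs.dsisr == 0x42);     /* same storage, either spelling */
        return 0;
}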
diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S
index 10e96f3e22fe..0508c14b4c28 100644
--- a/arch/powerpc/kernel/reloc_32.S
+++ b/arch/powerpc/kernel/reloc_32.S
@@ -30,7 +30,7 @@ R_PPC_RELATIVE = 22
_GLOBAL(relocate)
mflr r0 /* Save our LR */
- bl 0f /* Find our current runtime address */
+ bcl 20,31,$+4 /* Find our current runtime address */
0: mflr r12 /* Make it accessible */
mtlr r0
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 8561dfb33f24..32ee17753eb4 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -429,7 +429,7 @@ static void rtas_event_scan(struct work_struct *w)
do_event_scan();
- get_online_cpus();
+ cpus_read_lock();
/* raw_ OK because just using CPU as starting point. */
cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
@@ -451,7 +451,7 @@ static void rtas_event_scan(struct work_struct *w)
schedule_delayed_work_on(cpu, &event_scan_work,
__round_jiffies_relative(event_scan_delay, cpu));
- put_online_cpus();
+ cpus_read_unlock();
}
#ifdef CONFIG_PPC64
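get_online_cpus()/put_online_cpus() are thin wrappers around the CPU hotplug read lock, so call sites here (and in book3s_hv_builtin.c further down) are converted to cpus_read_lock()/cpus_read_unlock(). The pattern is a plain read-side critical section; an illustrative, hypothetical caller looks like:

/* Illustration only: keep CPUs from going on/offline across the walk. */
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/printk.h>

static void count_online_cpus_example(void)
{
        unsigned int cpu, n = 0;

        cpus_read_lock();               /* may sleep; blocks CPU hotplug */
        for_each_online_cpu(cpu)
                n++;
        cpus_read_unlock();

        pr_info("saw %u online CPUs\n", n);
}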
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index cc51fa52e783..1a998490fe60 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -11,10 +11,10 @@
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/seq_buf.h>
+#include <linux/debugfs.h>
#include <asm/asm-prototypes.h>
#include <asm/code-patching.h>
-#include <asm/debugfs.h>
#include <asm/security_features.h>
#include <asm/setup.h>
#include <asm/inst.h>
@@ -106,7 +106,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_barrier_nospec, barrier_nospec_get,
static __init int barrier_nospec_debugfs_init(void)
{
debugfs_create_file_unsafe("barrier_nospec", 0600,
- powerpc_debugfs_root, NULL,
+ arch_debugfs_dir, NULL,
&fops_barrier_nospec);
return 0;
}
@@ -114,7 +114,7 @@ device_initcall(barrier_nospec_debugfs_init);
static __init int security_feature_debugfs_init(void)
{
- debugfs_create_x64("security_features", 0400, powerpc_debugfs_root,
+ debugfs_create_x64("security_features", 0400, arch_debugfs_dir,
&powerpc_security_features);
return 0;
}
@@ -420,7 +420,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set,
static __init int stf_barrier_debugfs_init(void)
{
- debugfs_create_file_unsafe("stf_barrier", 0600, powerpc_debugfs_root,
+ debugfs_create_file_unsafe("stf_barrier", 0600, arch_debugfs_dir,
NULL, &fops_stf_barrier);
return 0;
}
@@ -748,7 +748,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get,
static __init int count_cache_flush_debugfs_init(void)
{
debugfs_create_file_unsafe("count_cache_flush", 0600,
- powerpc_debugfs_root, NULL,
+ arch_debugfs_dir, NULL,
&fops_count_cache_flush);
return 0;
}
@@ -834,9 +834,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_uaccess_flush, uaccess_flush_get, uaccess_flush_set
static __init int rfi_flush_debugfs_init(void)
{
- debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush);
- debugfs_create_file("entry_flush", 0600, powerpc_debugfs_root, NULL, &fops_entry_flush);
- debugfs_create_file("uaccess_flush", 0600, powerpc_debugfs_root, NULL, &fops_uaccess_flush);
+ debugfs_create_file("rfi_flush", 0600, arch_debugfs_dir, NULL, &fops_rfi_flush);
+ debugfs_create_file("entry_flush", 0600, arch_debugfs_dir, NULL, &fops_entry_flush);
+ debugfs_create_file("uaccess_flush", 0600, arch_debugfs_dir, NULL, &fops_uaccess_flush);
return 0;
}
device_initcall(rfi_flush_debugfs_init);
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index aa9c2d01424a..b1e43b69a559 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -33,7 +33,6 @@
#include <linux/of_platform.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
-#include <asm/debugfs.h>
#include <asm/io.h>
#include <asm/paca.h>
#include <asm/prom.h>
@@ -773,18 +772,6 @@ static int __init check_cache_coherency(void)
late_initcall(check_cache_coherency);
#endif /* CONFIG_CHECK_CACHE_COHERENCY */
-#ifdef CONFIG_DEBUG_FS
-struct dentry *powerpc_debugfs_root;
-EXPORT_SYMBOL(powerpc_debugfs_root);
-
-static int powerpc_debugfs_init(void)
-{
- powerpc_debugfs_root = debugfs_create_dir("powerpc", NULL);
- return 0;
-}
-arch_initcall(powerpc_debugfs_init);
-#endif
-
void ppc_printk_progress(char *s, unsigned short hex)
{
pr_info("%s\n", s);
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 1ff258f6c76c..eaa79a0996d1 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -32,7 +32,6 @@
#include <linux/nmi.h>
#include <linux/pgtable.h>
-#include <asm/debugfs.h>
#include <asm/kvm_guest.h>
#include <asm/io.h>
#include <asm/kdump.h>
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 447b78a87c8f..9cc7d3dbf439 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -78,6 +78,7 @@ struct task_struct *secondary_current;
bool has_big_cores;
bool coregroup_enabled;
bool thread_group_shares_l2;
+bool thread_group_shares_l3;
DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -101,7 +102,7 @@ enum {
#define MAX_THREAD_LIST_SIZE 8
#define THREAD_GROUP_SHARE_L1 1
-#define THREAD_GROUP_SHARE_L2 2
+#define THREAD_GROUP_SHARE_L2_L3 2
struct thread_groups {
unsigned int property;
unsigned int nr_groups;
@@ -122,14 +123,20 @@ static struct thread_groups_list tgl[NR_CPUS] __initdata;
* On big-cores system, thread_group_l1_cache_map for each CPU corresponds to
* the set its siblings that share the L1-cache.
*/
-static DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
+DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
/*
* On some big-cores system, thread_group_l2_cache_map for each CPU
* corresponds to the set its siblings within the core that share the
* L2-cache.
*/
-static DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
+DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
+
+/*
+ * On P10, thread_group_l3_cache_map for each CPU is equal to the
+ * thread_group_l2_cache_map
+ */
+DEFINE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map);
/* SMP operations for this machine */
struct smp_ops_t *smp_ops;
@@ -889,19 +896,41 @@ out:
return tg;
}
+static int update_mask_from_threadgroup(cpumask_var_t *mask, struct thread_groups *tg, int cpu, int cpu_group_start)
+{
+ int first_thread = cpu_first_thread_sibling(cpu);
+ int i;
+
+ zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
+
+ for (i = first_thread; i < first_thread + threads_per_core; i++) {
+ int i_group_start = get_cpu_thread_group_start(i, tg);
+
+ if (unlikely(i_group_start == -1)) {
+ WARN_ON_ONCE(1);
+ return -ENODATA;
+ }
+
+ if (i_group_start == cpu_group_start)
+ cpumask_set_cpu(i, *mask);
+ }
+
+ return 0;
+}
+
static int __init init_thread_group_cache_map(int cpu, int cache_property)
{
- int first_thread = cpu_first_thread_sibling(cpu);
- int i, cpu_group_start = -1, err = 0;
+ int cpu_group_start = -1, err = 0;
struct thread_groups *tg = NULL;
cpumask_var_t *mask = NULL;
if (cache_property != THREAD_GROUP_SHARE_L1 &&
- cache_property != THREAD_GROUP_SHARE_L2)
+ cache_property != THREAD_GROUP_SHARE_L2_L3)
return -EINVAL;
tg = get_thread_groups(cpu, cache_property, &err);
+
if (!tg)
return err;
@@ -912,25 +941,18 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property)
return -ENODATA;
}
- if (cache_property == THREAD_GROUP_SHARE_L1)
+ if (cache_property == THREAD_GROUP_SHARE_L1) {
mask = &per_cpu(thread_group_l1_cache_map, cpu);
- else if (cache_property == THREAD_GROUP_SHARE_L2)
+ update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start);
+ }
+ else if (cache_property == THREAD_GROUP_SHARE_L2_L3) {
mask = &per_cpu(thread_group_l2_cache_map, cpu);
-
- zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
-
- for (i = first_thread; i < first_thread + threads_per_core; i++) {
- int i_group_start = get_cpu_thread_group_start(i, tg);
-
- if (unlikely(i_group_start == -1)) {
- WARN_ON_ONCE(1);
- return -ENODATA;
- }
-
- if (i_group_start == cpu_group_start)
- cpumask_set_cpu(i, *mask);
+ update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start);
+ mask = &per_cpu(thread_group_l3_cache_map, cpu);
+ update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start);
}
+
return 0;
}
@@ -1020,14 +1042,16 @@ static int __init init_big_cores(void)
has_big_cores = true;
for_each_possible_cpu(cpu) {
- int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2);
+ int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2_L3);
if (err)
return err;
}
thread_group_shares_l2 = true;
- pr_debug("L2 cache only shared by the threads in the small core\n");
+ thread_group_shares_l3 = true;
+ pr_debug("L2/L3 cache only shared by the threads in the small core\n");
+
return 0;
}
@@ -1085,7 +1109,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
}
if (cpu_to_chip_id(boot_cpuid) != -1) {
- int idx = num_possible_cpus() / threads_per_core;
+ int idx = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
/*
* All threads of a core will all belong to the same core,
@@ -1376,7 +1400,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask)
l2_cache = cpu_to_l2cache(cpu);
if (!l2_cache || !*mask) {
/* Assume only core siblings share cache with this CPU */
- for_each_cpu(i, submask_fn(cpu))
+ for_each_cpu(i, cpu_sibling_mask(cpu))
set_cpus_related(cpu, i, cpu_l2_cache_mask);
return false;
@@ -1418,6 +1442,8 @@ static void remove_cpu_from_masks(int cpu)
struct cpumask *(*mask_fn)(int) = cpu_sibling_mask;
int i;
+ unmap_cpu_from_node(cpu);
+
if (shared_caches)
mask_fn = cpu_l2_cache_mask;
@@ -1502,7 +1528,9 @@ static void add_cpu_to_masks(int cpu)
* This CPU will not be in the online mask yet so we need to manually
* add it to it's own thread sibling mask.
*/
+ map_cpu_to_node(cpu, cpu_to_node(cpu));
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_core_mask(cpu));
for (i = first_thread; i < first_thread + threads_per_core; i++)
if (cpu_online(i))
@@ -1520,11 +1548,6 @@ static void add_cpu_to_masks(int cpu)
if (chip_id_lookup_table && ret)
chip_id = cpu_to_chip_id(cpu);
- if (chip_id == -1) {
- cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
- goto out;
- }
-
if (shared_caches)
submask_fn = cpu_l2_cache_mask;
@@ -1534,6 +1557,10 @@ static void add_cpu_to_masks(int cpu)
/* Skip all CPUs already part of current CPU core mask */
cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu));
+ /* If chip_id is -1; limit the cpu_core_mask to within DIE*/
+ if (chip_id == -1)
+ cpumask_and(mask, mask, cpu_cpu_mask(cpu));
+
for_each_cpu(i, mask) {
if (chip_id == cpu_to_chip_id(i)) {
or_cpumasks_related(cpu, i, submask_fn, cpu_core_mask);
@@ -1543,7 +1570,6 @@ static void add_cpu_to_masks(int cpu)
}
}
-out:
free_cpumask_var(mask);
}
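Among the smp.c changes, smp_prepare_cpus() now computes its per-core index with DIV_ROUND_UP() so a possible-CPU count that is not a full multiple of threads_per_core is no longer undercounted. The arithmetic is easy to check in isolation; the sketch below reproduces the kernel macro locally so it builds in user space.

#include <stdio.h>

/* Same definition as the kernel's DIV_ROUND_UP(), copied here so the
 * example is self-contained. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned int possible_cpus = 6, threads_per_core = 4;

        /* Plain division drops the partially populated core: 6 / 4 == 1. */
        printf("truncated:  %u\n", possible_cpus / threads_per_core);

        /* Rounding up keeps a slot for it: DIV_ROUND_UP(6, 4) == 2. */
        printf("rounded up: %u\n",
               DIV_ROUND_UP(possible_cpus, threads_per_core));
        return 0;
}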
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 2b0d04a1b7d2..9e4a4a7af380 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -8,6 +8,7 @@
* Copyright 2018 Nick Piggin, Michael Ellerman, IBM Corp.
*/
+#include <linux/delay.h>
#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/module.h>
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index bf4ae0f0e36c..825931e400df 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -41,20 +41,13 @@ static inline long do_mmap2(unsigned long addr, size_t len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long off, int shift)
{
- long ret = -EINVAL;
-
if (!arch_validate_prot(prot, addr))
- goto out;
+ return -EINVAL;
- if (shift) {
- if (off & ((1 << shift) - 1))
- goto out;
- off >>= shift;
- }
+ if (!IS_ALIGNED(off, 1 << shift))
+ return -EINVAL;
- ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off);
-out:
- return ret;
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> shift);
}
SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len,
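do_mmap2() now rejects a misaligned offset with IS_ALIGNED(off, 1 << shift) and shifts once on the success path, replacing the goto-based version. With shift == 0 the alignment mask is zero, so every offset passes, matching the old behaviour; the stand-alone check below (kernel macro reproduced locally) demonstrates the equivalence on a few sample values.

#include <stdbool.h>
#include <stdio.h>

/* Local copy of the kernel's IS_ALIGNED() so this builds in user space. */
#define IS_ALIGNED(x, a) (((x) & ((__typeof__(x))(a) - 1)) == 0)

/* Old check: low 'shift' bits of off must be clear (no-op when shift == 0). */
static bool old_ok(unsigned long off, int shift)
{
        return !shift || !(off & ((1UL << shift) - 1));
}

/* New check, as in the hunk above. */
static bool new_ok(unsigned long off, int shift)
{
        return IS_ALIGNED(off, 1UL << shift);
}

int main(void)
{
        unsigned long offs[] = { 0, 1, 4096, 4097, 12345 };
        int shifts[] = { 0, 12 };

        for (int s = 0; s < 2; s++)
                for (int i = 0; i < 5; i++)
                        printf("off=%lu shift=%d old=%d new=%d\n",
                               offs[i], shifts[s],
                               old_ok(offs[i], shifts[s]),
                               new_ok(offs[i], shifts[s]));
        return 0;
}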
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 6f3953f2a0d5..29b55e2e035c 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -526,3 +526,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease
diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index b9a047d92ec0..8e83d19fe8fa 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -164,7 +164,7 @@ static void tau_work_func(struct work_struct *work)
queue_work(tau_workq, work);
}
-DECLARE_WORK(tau_work, tau_work_func);
+static DECLARE_WORK(tau_work, tau_work_func);
/*
* setup the TAU
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index c487ba5a6e11..934d8ae66cc6 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -31,6 +31,7 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/string.h>
@@ -52,8 +53,6 @@
#include <linux/irq_work.h>
#include <linux/of_clk.h>
#include <linux/suspend.h>
-#include <linux/sched/cputime.h>
-#include <linux/sched/clock.h>
#include <linux/processor.h>
#include <asm/trace.h>
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index cd5f633c9b1b..4390f8d72126 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -37,10 +37,10 @@
#include <linux/smp.h>
#include <linux/console.h>
#include <linux/kmsg_dump.h>
+#include <linux/debugfs.h>
#include <asm/emulated_ops.h>
#include <linux/uaccess.h>
-#include <asm/debugfs.h>
#include <asm/interrupt.h>
#include <asm/io.h>
#include <asm/machdep.h>
@@ -427,7 +427,7 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs)
return;
nonrecoverable:
- regs_set_return_msr(regs, regs->msr & ~MSR_RI);
+ regs_set_unrecoverable(regs);
#endif
}
DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception)
@@ -497,7 +497,7 @@ out:
die("Unrecoverable nested System Reset", regs, SIGABRT);
#endif
/* Must die if the interrupt is not recoverable */
- if (!(regs->msr & MSR_RI)) {
+ if (regs_is_unrecoverable(regs)) {
/* For the reason explained in die_mce, nmi_exit before die */
nmi_exit();
die("Unrecoverable System Reset", regs, SIGABRT);
@@ -549,7 +549,7 @@ static inline int check_io_access(struct pt_regs *regs)
printk(KERN_DEBUG "%s bad port %lx at %p\n",
(*nip & 0x100)? "OUT to": "IN from",
regs->gpr[rb] - _IO_BASE, nip);
- regs_set_return_msr(regs, regs->msr | MSR_RI);
+ regs_set_recoverable(regs);
regs_set_return_ip(regs, extable_fixup(entry));
return 1;
}
@@ -561,7 +561,7 @@ static inline int check_io_access(struct pt_regs *regs)
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
/* On 4xx, the reason for the machine check or program exception
is in the ESR. */
-#define get_reason(regs) ((regs)->dsisr)
+#define get_reason(regs) ((regs)->esr)
#define REASON_FP ESR_FP
#define REASON_ILLEGAL (ESR_PIL | ESR_PUO)
#define REASON_PRIVILEGED ESR_PPR
@@ -839,7 +839,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
bail:
/* Must die if the interrupt is not recoverable */
- if (!(regs->msr & MSR_RI))
+ if (regs_is_unrecoverable(regs))
die_mce("Unrecoverable Machine check", regs, SIGBUS);
#ifdef CONFIG_PPC_BOOK3S_64
@@ -1481,8 +1481,13 @@ static void do_program_check(struct pt_regs *regs)
if (!(regs->msr & MSR_PR) && /* not user-mode */
report_bug(bugaddr, regs) == BUG_TRAP_TYPE_WARN) {
- regs_add_return_ip(regs, 4);
- return;
+ const struct exception_table_entry *entry;
+
+ entry = search_exception_tables(bugaddr);
+ if (entry) {
+ regs_set_return_ip(regs, extable_fixup(entry) + regs->nip - bugaddr);
+ return;
+ }
}
_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
return;
@@ -2271,7 +2276,7 @@ static int __init ppc_warn_emulated_init(void)
struct ppc_emulated_entry *entries = (void *)&ppc_emulated;
dir = debugfs_create_dir("emulated_instructions",
- powerpc_debugfs_root);
+ arch_debugfs_dir);
debugfs_create_u32("do_warn", 0644, dir, &ppc_warn_emulated);
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index fc120fac1910..ba03eedfdcd8 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -65,9 +65,8 @@ _GLOBAL(load_up_altivec)
1:
/* enable use of VMX after return */
#ifdef CONFIG_PPC32
- mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
+ addi r5,r2,THREAD
oris r9,r9,MSR_VEC@h
- tovirt(r5, r5)
#else
ld r4,PACACURRENT(r13)
addi r5,r4,THREAD /* Get THREAD */
@@ -81,7 +80,6 @@ _GLOBAL(load_up_altivec)
li r4,1
stb r4,THREAD_LOAD_VEC(r5)
addi r6,r5,THREAD_VRSTATE
- li r4,1
li r10,VRSTATE_VSCR
stw r4,THREAD_USED_VR(r5)
lvx v0,r10,r6
diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index 8a449b2d8715..89c069d664a5 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -64,15 +64,18 @@ int default_machine_kexec_prepare(struct kimage *image)
begin = image->segment[i].mem;
end = begin + image->segment[i].memsz;
- if ((begin < high) && (end > low))
+ if ((begin < high) && (end > low)) {
+ of_node_put(node);
return -ETXTBSY;
+ }
}
}
return 0;
}
-static void copy_segments(unsigned long ind)
+/* Called during kexec sequence with MMU off */
+static notrace void copy_segments(unsigned long ind)
{
unsigned long entry;
unsigned long *ptr;
@@ -105,7 +108,8 @@ static void copy_segments(unsigned long ind)
}
}
-void kexec_copy_flush(struct kimage *image)
+/* Called during kexec sequence with MMU off */
+notrace void kexec_copy_flush(struct kimage *image)
{
long i, nr_segments = image->nr_segments;
struct kexec_segment ranges[KEXEC_SEGMENT_MAX];
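The default_machine_kexec_prepare() hunk adds the missing of_node_put() on the early-return path, so the reference held on 'node' is dropped before -ETXTBSY is returned. The general rule, shown on a hypothetical walk below (made-up property name), is that any break or return out of an of_find_* iteration must release the node currently held by the iterator.

/* Illustration only: dropping the held reference on an early exit. */
#include <linux/errno.h>
#include <linux/of.h>

static int find_busy_node_example(void)
{
        struct device_node *node = NULL;

        while ((node = of_find_node_by_type(node, "pci")) != NULL) {
                if (of_property_read_bool(node, "example-busy")) {
                        /* The iterator holds a reference on 'node'; drop it
                         * before the early return, as the hunk above does. */
                        of_node_put(node);
                        return -ETXTBSY;
                }
        }

        return 0;
}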
diff --git a/arch/powerpc/kexec/relocate_32.S b/arch/powerpc/kexec/relocate_32.S
index 61946c19e07c..cf6e52bdf8d8 100644
--- a/arch/powerpc/kexec/relocate_32.S
+++ b/arch/powerpc/kexec/relocate_32.S
@@ -93,7 +93,7 @@ wmmucr:
* Invalidate all the TLB entries except the current entry
* where we are running from
*/
- bl 0f /* Find our address */
+ bcl 20,31,$+4 /* Find our address */
0: mflr r5 /* Make it accessible */
tlbsx r23,0,r5 /* Find entry we are in */
li r4,0 /* Start at TLB entry 0 */
@@ -158,7 +158,7 @@ write_out:
/* Switch to other address space in MSR */
insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */
- bl 1f
+ bcl 20,31,$+4
1: mflr r8
addi r8, r8, (2f-1b) /* Find the target offset */
@@ -202,7 +202,7 @@ next_tlb:
li r9,0
insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */
- bl 1f
+ bcl 20,31,$+4
1: mflr r8
and r8, r8, r11 /* Get our offset within page */
addi r8, r8, (2f-1b)
@@ -240,7 +240,7 @@ setup_map_47x:
sync
/* Find the entry we are running from */
- bl 2f
+ bcl 20,31,$+4
2: mflr r23
tlbsx r23, 0, r23
tlbre r24, r23, 0 /* TLB Word 0 */
@@ -296,7 +296,7 @@ clear_utlb_entry:
/* Update the msr to the new TS */
insrwi r5, r7, 1, 26
- bl 1f
+ bcl 20,31,$+4
1: mflr r6
addi r6, r6, (2f-1b)
@@ -355,7 +355,7 @@ write_utlb:
/* Defaults to 256M */
lis r10, 0x1000
- bl 1f
+ bcl 20,31,$+4
1: mflr r4
addi r4, r4, (2f-1b) /* virtual address of 2f */
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index e45644657d49..ff581d70f20c 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -38,7 +38,6 @@ config KVM_BOOK3S_32_HANDLER
config KVM_BOOK3S_64_HANDLER
bool
select KVM_BOOK3S_HANDLER
- select PPC_DAWR_FORCE_ENABLE
config KVM_BOOK3S_PR_POSSIBLE
bool
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 740e51def5a5..58391b4b32ed 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -23,7 +23,8 @@ extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu,
extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu,
int sprn, ulong *spr_val);
extern int kvmppc_book3s_init_pr(void);
-extern void kvmppc_book3s_exit_pr(void);
+void kvmppc_book3s_exit_pr(void);
+extern int kvmppc_handle_exit_pr(struct kvm_vcpu *vcpu, unsigned int exit_nr);
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val);
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 26b8b27a3755..feee40cb2ba1 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -196,7 +196,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
hva_t ptegp;
u64 pteg[16];
u64 avpn = 0;
- u64 v, r;
+ u64 r;
u64 v_val, v_mask;
u64 eaddr_mask;
int i;
@@ -285,7 +285,6 @@ do_second:
goto do_second;
}
- v = be64_to_cpu(pteg[i]);
r = be64_to_cpu(pteg[i+1]);
pp = (r & HPTE_R_PP) | key;
if (r & HPTE_R_PP0)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index b5905ae4377c..16359525a40f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -44,6 +44,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
(to != NULL) ? __pa(to): 0,
(from != NULL) ? __pa(from): 0, n);
+ if (eaddr & (0xFFFUL << 52))
+ return ret;
+
quadrant = 1;
if (!pid)
quadrant = 2;
@@ -65,10 +68,12 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
}
isync();
+ pagefault_disable();
if (is_load)
- ret = copy_from_user_nofault(to, (const void __user *)from, n);
+ ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
else
- ret = copy_to_user_nofault((void __user *)to, from, n);
+ ret = __copy_to_user_inatomic((void __user *)to, from, n);
+ pagefault_enable();
/* switch the pid first to avoid running host with unallocated pid */
if (quadrant == 1 && pid != old_pid)
@@ -81,7 +86,6 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
return ret;
}
-EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
void *to, void *from, unsigned long n)
@@ -117,14 +121,12 @@ long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
return ret;
}
-EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);
long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
unsigned long n)
{
return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}
-EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, u64 root,
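The copy in __kvmhv_copy_tofrom_guest_radix() switches from copy_{from,to}_user_nofault() to the __copy_*_inatomic() primitives wrapped in an explicit pagefault_disable()/pagefault_enable() pair. The sketch below shows only the generic shape of such a non-faulting copy on an ordinary, already-validated user pointer; it illustrates the primitives, not the quadrant access performed above.

#include <linux/uaccess.h>

/* Illustration only: best-effort copy from user space that must not sleep
 * or take a page fault. The caller is assumed to have validated 'src'
 * (access_ok). Returns the number of bytes NOT copied, like the
 * underlying primitive. */
static unsigned long peek_user_nofault(void *dst,
                                       const void __user *src,
                                       unsigned long n)
{
        unsigned long left;

        pagefault_disable();
        left = __copy_from_user_inatomic(dst, src, n);
        pagefault_enable();

        return left;
}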
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index dc6591548f0c..636c6ae0939b 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -173,10 +173,13 @@ static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt,
idx -= stt->offset;
page = stt->pages[idx / TCES_PER_PAGE];
/*
- * page must not be NULL in real mode,
- * kvmppc_rm_ioba_validate() must have taken care of this.
+ * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is
+ * being cleared, otherwise it returns H_TOO_HARD and we skip this.
*/
- WARN_ON_ONCE_RM(!page);
+ if (!page) {
+ WARN_ON_ONCE_RM(tce != 0);
+ return;
+ }
tbl = kvmppc_page_address(page);
tbl[idx % TCES_PER_PAGE] = tce;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 085fb8ecbf68..bb0dacf7cbec 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -59,6 +59,7 @@
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/lppaca.h>
+#include <asm/pmc.h>
#include <asm/processor.h>
#include <asm/cputhreads.h>
#include <asm/page.h>
@@ -1165,7 +1166,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
break;
#endif
case H_RANDOM:
- if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
+ if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4]))
ret = H_HARDWARE;
break;
case H_RPT_INVALIDATE:
@@ -1679,6 +1680,21 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
}
break;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ case BOOK3S_INTERRUPT_HV_SOFTPATCH:
+ /*
+ * This occurs for various TM-related instructions that
+ * we need to emulate on POWER9 DD2.2. We have already
+ * handled the cases where the guest was in real-suspend
+ * mode and was transitioning to transactional state.
+ */
+ r = kvmhv_p9_tm_emulation(vcpu);
+ if (r != -1)
+ break;
+ fallthrough; /* go to facility unavailable handler */
+#endif
+
/*
* This occurs if the guest (kernel or userspace), does something that
* is prohibited by HFSCR.
@@ -1697,18 +1713,6 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
}
break;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
- case BOOK3S_INTERRUPT_HV_SOFTPATCH:
- /*
- * This occurs for various TM-related instructions that
- * we need to emulate on POWER9 DD2.2. We have already
- * handled the cases where the guest was in real-suspend
- * mode and was transitioning to transactional state.
- */
- r = kvmhv_p9_tm_emulation(vcpu);
- break;
-#endif
-
case BOOK3S_INTERRUPT_HV_RM_HARD:
r = RESUME_PASSTHROUGH;
break;
@@ -1727,6 +1731,7 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
{
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
int r;
int srcu_idx;
@@ -1811,9 +1816,41 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
* mode and was transitioning to transactional state.
*/
r = kvmhv_p9_tm_emulation(vcpu);
- break;
+ if (r != -1)
+ break;
+ fallthrough; /* go to facility unavailable handler */
#endif
+ case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
+ u64 cause = vcpu->arch.hfscr >> 56;
+
+ /*
+ * Only pass HFU interrupts to the L1 if the facility is
+ * permitted but disabled by the L1's HFSCR, otherwise
+ * the interrupt does not make sense to the L1 so turn
+ * it into a HEAI.
+ */
+ if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) ||
+ (nested->hfscr & (1UL << cause))) {
+ vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST;
+
+ /*
+ * If the fetch failed, return to guest and
+ * try executing it again.
+ */
+ r = kvmppc_get_last_inst(vcpu, INST_GENERIC,
+ &vcpu->arch.emul_inst);
+ if (r != EMULATE_DONE)
+ r = RESUME_GUEST;
+ else
+ r = RESUME_HOST;
+ } else {
+ r = RESUME_HOST;
+ }
+
+ break;
+ }
+
case BOOK3S_INTERRUPT_HV_RM_HARD:
vcpu->arch.trap = 0;
r = RESUME_GUEST;
@@ -2684,6 +2721,7 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
spin_lock_init(&vcpu->arch.vpa_update_lock);
spin_lock_init(&vcpu->arch.tbacct_lock);
vcpu->arch.busy_preempt = TB_NIL;
+ vcpu->arch.shregs.msr = MSR_ME;
vcpu->arch.intr_msr = MSR_SF | MSR_ME;
/*
@@ -2705,6 +2743,8 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
if (cpu_has_feature(CPU_FTR_TM_COMP))
vcpu->arch.hfscr |= HFSCR_TM;
+ vcpu->arch.hfscr_permitted = vcpu->arch.hfscr;
+
kvmppc_mmu_book3s_hv_init(vcpu);
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -3727,7 +3767,6 @@ static void load_spr_state(struct kvm_vcpu *vcpu)
mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
mtspr(SPRN_BESCR, vcpu->arch.bescr);
- mtspr(SPRN_WORT, vcpu->arch.wort);
mtspr(SPRN_TIDR, vcpu->arch.tid);
mtspr(SPRN_AMR, vcpu->arch.amr);
mtspr(SPRN_UAMOR, vcpu->arch.uamor);
@@ -3754,7 +3793,6 @@ static void store_spr_state(struct kvm_vcpu *vcpu)
vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
vcpu->arch.bescr = mfspr(SPRN_BESCR);
- vcpu->arch.wort = mfspr(SPRN_WORT);
vcpu->arch.tid = mfspr(SPRN_TIDR);
vcpu->arch.amr = mfspr(SPRN_AMR);
vcpu->arch.uamor = mfspr(SPRN_UAMOR);
@@ -3786,7 +3824,6 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
struct p9_host_os_sprs *host_os_sprs)
{
mtspr(SPRN_PSPB, 0);
- mtspr(SPRN_WORT, 0);
mtspr(SPRN_UAMOR, 0);
mtspr(SPRN_DSCR, host_os_sprs->dscr);
@@ -3852,6 +3889,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+#ifdef CONFIG_PPC_PSERIES
+ if (kvmhv_on_pseries()) {
+ barrier();
+ if (vcpu->arch.vpa.pinned_addr) {
+ struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+ get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use;
+ } else {
+ get_lppaca()->pmcregs_in_use = 1;
+ }
+ barrier();
+ }
+#endif
kvmhv_load_guest_pmu(vcpu);
msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
@@ -3986,6 +4035,13 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
save_pmu |= nesting_enabled(vcpu->kvm);
kvmhv_save_guest_pmu(vcpu, save_pmu);
+#ifdef CONFIG_PPC_PSERIES
+ if (kvmhv_on_pseries()) {
+ barrier();
+ get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse();
+ barrier();
+ }
+#endif
vc->entry_exit_map = 0x101;
vc->in_guest = 0;
@@ -5328,6 +5384,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
struct kvmppc_passthru_irqmap *pimap;
struct irq_chip *chip;
int i, rc = 0;
+ struct irq_data *host_data;
if (!kvm_irq_bypass)
return 1;
@@ -5355,7 +5412,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
* what our real-mode EOI code does, or a XIVE interrupt
*/
chip = irq_data_get_irq_chip(&desc->irq_data);
- if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) {
+ if (!chip || !is_pnv_opal_msi(chip)) {
pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
host_irq, guest_gsi);
mutex_unlock(&kvm->lock);
@@ -5392,15 +5449,22 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
* the KVM real mode handler.
*/
smp_wmb();
- irq_map->r_hwirq = desc->irq_data.hwirq;
+
+ /*
+ * The 'host_irq' number is mapped in the PCI-MSI domain but
+ * the underlying calls, which will EOI the interrupt in real
+ * mode, need an HW IRQ number mapped in the XICS IRQ domain.
+ */
+ host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq);
+ irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);
if (i == pimap->n_mapped)
pimap->n_mapped++;
if (xics_on_xive())
- rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
+ rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
else
- kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+ kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
if (rc)
irq_map->r_hwirq = 0;
@@ -5439,7 +5503,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
}
if (xics_on_xive())
- rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
+ rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
else
kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index be8ef1c5b1bf..fcf4760a3a0e 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -137,23 +137,23 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
* exist in the system. We use a counter of VMs to track this.
*
* One of the operations we need to block is onlining of secondaries, so we
- * protect hv_vm_count with get/put_online_cpus().
+ * protect hv_vm_count with cpus_read_lock/unlock().
*/
static atomic_t hv_vm_count;
void kvm_hv_vm_activated(void)
{
- get_online_cpus();
+ cpus_read_lock();
atomic_inc(&hv_vm_count);
- put_online_cpus();
+ cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(kvm_hv_vm_activated);
void kvm_hv_vm_deactivated(void)
{
- get_online_cpus();
+ cpus_read_lock();
atomic_dec(&hv_vm_count);
- put_online_cpus();
+ cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated);
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 898f942eb198..ed8a2c9f5629 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -99,13 +99,12 @@ static void byteswap_hv_regs(struct hv_guest_state *hr)
hr->dawrx1 = swab64(hr->dawrx1);
}
-static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
+static void save_hv_return_state(struct kvm_vcpu *vcpu,
struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
hr->dpdes = vc->dpdes;
- hr->hfscr = vcpu->arch.hfscr;
hr->purr = vcpu->arch.purr;
hr->spurr = vcpu->arch.spurr;
hr->ic = vcpu->arch.ic;
@@ -119,7 +118,7 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
hr->pidr = vcpu->arch.pid;
hr->cfar = vcpu->arch.cfar;
hr->ppr = vcpu->arch.ppr;
- switch (trap) {
+ switch (vcpu->arch.trap) {
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
hr->hdar = vcpu->arch.fault_dar;
hr->hdsisr = vcpu->arch.fault_dsisr;
@@ -128,55 +127,17 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
case BOOK3S_INTERRUPT_H_INST_STORAGE:
hr->asdr = vcpu->arch.fault_gpa;
break;
+ case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
+ hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) |
+ (HFSCR_INTR_CAUSE & vcpu->arch.hfscr));
+ break;
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
hr->heir = vcpu->arch.emul_inst;
break;
}
}
-/*
- * This can result in some L0 HV register state being leaked to an L1
- * hypervisor when the hv_guest_state is copied back to the guest after
- * being modified here.
- *
- * There is no known problem with such a leak, and in many cases these
- * register settings could be derived by the guest by observing behaviour
- * and timing, interrupts, etc., but it is an issue to consider.
- */
-static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
-{
- struct kvmppc_vcore *vc = vcpu->arch.vcore;
- u64 mask;
-
- /*
- * Don't let L1 change LPCR bits for the L2 except these:
- */
- mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
- LPCR_LPES | LPCR_MER;
-
- /*
- * Additional filtering is required depending on hardware
- * and configuration.
- */
- hr->lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm,
- (vc->lpcr & ~mask) | (hr->lpcr & mask));
-
- /*
- * Don't let L1 enable features for L2 which we've disabled for L1,
- * but preserve the interrupt cause field.
- */
- hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
-
- /* Don't let data address watchpoint match in hypervisor state */
- hr->dawrx0 &= ~DAWRX_HYP;
- hr->dawrx1 &= ~DAWRX_HYP;
-
- /* Don't let completed instruction address breakpt match in HV state */
- if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
- hr->ciabr &= ~CIABR_PRIV;
-}
-
-static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -288,6 +249,43 @@ static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu,
sizeof(struct pt_regs));
}
+static void load_l2_hv_regs(struct kvm_vcpu *vcpu,
+ const struct hv_guest_state *l2_hv,
+ const struct hv_guest_state *l1_hv, u64 *lpcr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ u64 mask;
+
+ restore_hv_regs(vcpu, l2_hv);
+
+ /*
+ * Don't let L1 change LPCR bits for the L2 except these:
+ */
+ mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
+ LPCR_LPES | LPCR_MER;
+
+ /*
+ * Additional filtering is required depending on hardware
+ * and configuration.
+ */
+ *lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm,
+ (vc->lpcr & ~mask) | (*lpcr & mask));
+
+ /*
+ * Don't let L1 enable features for L2 which we don't allow for L1,
+ * but preserve the interrupt cause field.
+ */
+ vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr_permitted);
+
+ /* Don't let data address watchpoint match in hypervisor state */
+ vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP;
+ vcpu->arch.dawrx1 = l2_hv->dawrx1 & ~DAWRX_HYP;
+
+ /* Don't let completed instruction address breakpt match in HV state */
+ if ((l2_hv->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+ vcpu->arch.ciabr = l2_hv->ciabr & ~CIABR_PRIV;
+}
+
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
{
long int err, r;
@@ -296,7 +294,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
struct hv_guest_state l2_hv = {0}, saved_l1_hv;
struct kvmppc_vcore *vc = vcpu->arch.vcore;
u64 hv_ptr, regs_ptr;
- u64 hdec_exp;
+ u64 hdec_exp, lpcr;
s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
if (vcpu->kvm->arch.l1_ptcr == 0)
@@ -364,13 +362,14 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
/* set L1 state to L2 state */
vcpu->arch.nested = l2;
vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
+ l2->hfscr = l2_hv.hfscr;
vcpu->arch.regs = l2_regs;
/* Guest must always run with ME enabled, HV disabled. */
vcpu->arch.shregs.msr = (vcpu->arch.regs.msr | MSR_ME) & ~MSR_HV;
- sanitise_hv_regs(vcpu, &l2_hv);
- restore_hv_regs(vcpu, &l2_hv);
+ lpcr = l2_hv.lpcr;
+ load_l2_hv_regs(vcpu, &l2_hv, &saved_l1_hv, &lpcr);
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
@@ -380,7 +379,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
r = RESUME_HOST;
break;
}
- r = kvmhv_run_single_vcpu(vcpu, hdec_exp, l2_hv.lpcr);
+ r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr);
} while (is_kvmppc_resume_guest(r));
/* save L2 state for return */
@@ -390,7 +389,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
delta_ic = vcpu->arch.ic - l2_hv.ic;
delta_vtb = vc->vtb - l2_hv.vtb;
- save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
+ save_hv_return_state(vcpu, &l2_hv);
/* restore L1 state */
vcpu->arch.nested = NULL;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 0a11ec88a0ae..587c33fc4564 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -706,6 +706,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
icp->rm_eoied_irq = irq;
}
+ /* Handle passthrough interrupts */
if (state->host_irq) {
++vcpu->stat.pthru_all;
if (state->intr_cpu != -1) {
@@ -759,12 +760,12 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
static unsigned long eoi_rc;
-static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
+static void icp_eoi(struct irq_data *d, u32 hwirq, __be32 xirr, bool *again)
{
void __iomem *xics_phys;
int64_t rc;
- rc = pnv_opal_pci_msi_eoi(c, hwirq);
+ rc = pnv_opal_pci_msi_eoi(d);
if (rc)
eoi_rc = rc;
@@ -872,8 +873,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
icp_rm_deliver_irq(xics, icp, irq, false);
/* EOI the interrupt */
- icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
- again);
+ icp_eoi(irq_desc_get_irq_data(irq_map->desc), irq_map->r_hwirq, xirr, again);
if (check_too_hard(xics, icp) == H_TOO_HARD)
return 2;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 8dd437d7a2c6..75079397c2a5 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1088,12 +1088,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE
beq kvmppc_hisi
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
- /* For softpatch interrupt, go off and do TM instruction emulation */
- cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
- beq kvmppc_tm_emul
-#endif
-
/* See if this is a leftover HDEC interrupt */
cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
bne 2f
@@ -1599,42 +1593,6 @@ maybe_reenter_guest:
blt deliver_guest_interrupt
b guest_exit_cont
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/*
- * Softpatch interrupt for transactional memory emulation cases
- * on POWER9 DD2.2. This is early in the guest exit path - we
- * haven't saved registers or done a treclaim yet.
- */
-kvmppc_tm_emul:
- /* Save instruction image in HEIR */
- mfspr r3, SPRN_HEIR
- stw r3, VCPU_HEIR(r9)
-
- /*
- * The cases we want to handle here are those where the guest
- * is in real suspend mode and is trying to transition to
- * transactional mode.
- */
- lbz r0, HSTATE_FAKE_SUSPEND(r13)
- cmpwi r0, 0 /* keep exiting guest if in fake suspend */
- bne guest_exit_cont
- rldicl r3, r11, 64 - MSR_TS_S_LG, 62
- cmpwi r3, 1 /* or if not in suspend state */
- bne guest_exit_cont
-
- /* Call C code to do the emulation */
- mr r3, r9
- bl kvmhv_p9_tm_emulation_early
- nop
- ld r9, HSTATE_KVM_VCPU(r13)
- li r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
- cmpwi r3, 0
- beq guest_exit_cont /* continue exiting if not handled */
- ld r10, VCPU_PC(r9)
- ld r11, VCPU_MSR(r9)
- b fast_interrupt_c_return /* go back to guest if handled */
-#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
-
/*
* Check whether an HDSI is an HPTE not found fault or something else.
* If it is an HPTE not found fault that is due to the guest accessing
diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c
index cc90b8b82329..866cadd70094 100644
--- a/arch/powerpc/kvm/book3s_hv_tm.c
+++ b/arch/powerpc/kvm/book3s_hv_tm.c
@@ -47,6 +47,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
int ra, rs;
/*
+ * The TM softpatch interrupt sets NIP to the instruction following
+ * the faulting instruction, which is not executed. Rewind nip to the
+ * faulting instruction so it looks like a normal synchronous
+ * interrupt, then update nip in the places where the instruction is
+ * emulated.
+ */
+ vcpu->arch.regs.nip -= 4;
+
+ /*
* rfid, rfebb, and mtmsrd encode bit 31 = 0 since it's a reserved bit
* in these instructions, so masking bit 31 out doesn't change these
* instructions. For treclaim., tsr., and trechkpt. instructions if bit
@@ -67,7 +76,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
(newmsr & MSR_TM)));
newmsr = sanitize_msr(newmsr);
vcpu->arch.shregs.msr = newmsr;
- vcpu->arch.cfar = vcpu->arch.regs.nip - 4;
+ vcpu->arch.cfar = vcpu->arch.regs.nip;
vcpu->arch.regs.nip = vcpu->arch.shregs.srr0;
return RESUME_GUEST;
@@ -79,14 +88,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
}
/* check EBB facility is available */
if (!(vcpu->arch.hfscr & HFSCR_EBB)) {
- /* generate an illegal instruction interrupt */
- kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
- return RESUME_GUEST;
+ vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE;
+ vcpu->arch.hfscr |= (u64)FSCR_EBB_LG << 56;
+ vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL;
+ return -1; /* rerun host interrupt handler */
}
if ((msr & MSR_PR) && !(vcpu->arch.fscr & FSCR_EBB)) {
/* generate a facility unavailable interrupt */
- vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) |
- ((u64)FSCR_EBB_LG << 56);
+ vcpu->arch.fscr &= ~FSCR_INTR_CAUSE;
+ vcpu->arch.fscr |= (u64)FSCR_EBB_LG << 56;
kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL);
return RESUME_GUEST;
}
@@ -100,7 +110,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
vcpu->arch.bescr = bescr;
msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
vcpu->arch.shregs.msr = msr;
- vcpu->arch.cfar = vcpu->arch.regs.nip - 4;
+ vcpu->arch.cfar = vcpu->arch.regs.nip;
vcpu->arch.regs.nip = vcpu->arch.ebbrr;
return RESUME_GUEST;
@@ -116,6 +126,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
newmsr = (newmsr & ~MSR_LE) | (msr & MSR_LE);
newmsr = sanitize_msr(newmsr);
vcpu->arch.shregs.msr = newmsr;
+ vcpu->arch.regs.nip += 4;
return RESUME_GUEST;
/* ignore bit 31, see comment above */
@@ -128,14 +139,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
}
/* check for TM disabled in the HFSCR or MSR */
if (!(vcpu->arch.hfscr & HFSCR_TM)) {
- /* generate an illegal instruction interrupt */
- kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
- return RESUME_GUEST;
+ vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE;
+ vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56;
+ vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL;
+ return -1; /* rerun host interrupt handler */
}
if (!(msr & MSR_TM)) {
/* generate a facility unavailable interrupt */
- vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) |
- ((u64)FSCR_TM_LG << 56);
+ vcpu->arch.fscr &= ~FSCR_INTR_CAUSE;
+ vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56;
kvmppc_book3s_queue_irqprio(vcpu,
BOOK3S_INTERRUPT_FAC_UNAVAIL);
return RESUME_GUEST;
@@ -152,20 +164,22 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
msr = (msr & ~MSR_TS_MASK) | MSR_TS_S;
}
vcpu->arch.shregs.msr = msr;
+ vcpu->arch.regs.nip += 4;
return RESUME_GUEST;
/* ignore bit 31, see comment above */
case (PPC_INST_TRECLAIM & PO_XOP_OPCODE_MASK):
/* check for TM disabled in the HFSCR or MSR */
if (!(vcpu->arch.hfscr & HFSCR_TM)) {
- /* generate an illegal instruction interrupt */
- kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
- return RESUME_GUEST;
+ vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE;
+ vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56;
+ vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL;
+ return -1; /* rerun host interrupt handler */
}
if (!(msr & MSR_TM)) {
/* generate a facility unavailable interrupt */
- vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) |
- ((u64)FSCR_TM_LG << 56);
+ vcpu->arch.fscr &= ~FSCR_INTR_CAUSE;
+ vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56;
kvmppc_book3s_queue_irqprio(vcpu,
BOOK3S_INTERRUPT_FAC_UNAVAIL);
return RESUME_GUEST;
@@ -189,6 +203,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
+ vcpu->arch.regs.nip += 4;
return RESUME_GUEST;
/* ignore bit 31, see comment above */
@@ -196,14 +211,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
/* XXX do we need to check for PR=0 here? */
/* check for TM disabled in the HFSCR or MSR */
if (!(vcpu->arch.hfscr & HFSCR_TM)) {
- /* generate an illegal instruction interrupt */
- kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
- return RESUME_GUEST;
+ vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE;
+ vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56;
+ vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL;
+ return -1; /* rerun host interrupt handler */
}
if (!(msr & MSR_TM)) {
/* generate a facility unavailable interrupt */
- vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) |
- ((u64)FSCR_TM_LG << 56);
+ vcpu->arch.fscr &= ~FSCR_INTR_CAUSE;
+ vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56;
kvmppc_book3s_queue_irqprio(vcpu,
BOOK3S_INTERRUPT_FAC_UNAVAIL);
return RESUME_GUEST;
@@ -220,6 +236,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
vcpu->arch.shregs.msr = msr | MSR_TS_S;
+ vcpu->arch.regs.nip += 4;
return RESUME_GUEST;
}
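The hunk above moves the NIP bookkeeping for the TM softpatch interrupt: nip is rewound to the faulting instruction on entry and only stepped past it at the points where the instruction is actually emulated, so the "return -1" cases hand the trap back to the host with nip still at the faulting instruction, and cfar becomes simply the current nip. A minimal userspace sketch of that control flow, with hypothetical names (fake_vcpu, emulate_softpatch, INSN_*), not the kernel code itself:

/*
 * Illustrative sketch of the nip handling pattern: rewind on entry,
 * advance only where the instruction is actually emulated.
 */
#include <stdio.h>
#include <stdint.h>

struct fake_vcpu {
	uint64_t nip;	/* next instruction pointer */
	uint64_t cfar;	/* come-from address register */
};

enum { INSN_TSR, INSN_RFEBB, INSN_UNHANDLED };

/* Returns 1 for "resume guest", -1 for "rerun host interrupt handler". */
static int emulate_softpatch(struct fake_vcpu *vcpu, int insn, uint64_t ebbrr)
{
	/* The softpatch interrupt left nip after the faulting instruction. */
	vcpu->nip -= 4;

	switch (insn) {
	case INSN_TSR:
		/* Emulated in place: step past the instruction. */
		vcpu->nip += 4;
		return 1;
	case INSN_RFEBB:
		/* Redirecting flow: cfar is now simply the faulting nip. */
		vcpu->cfar = vcpu->nip;
		vcpu->nip = ebbrr;
		return 1;
	default:
		/* Not emulated: nip still points at the faulting instruction. */
		return -1;
	}
}

int main(void)
{
	struct fake_vcpu v = { .nip = 0x1004 };	/* interrupt NIP = faulting insn + 4 */

	emulate_softpatch(&v, INSN_UNHANDLED, 0);
	printf("unhandled: nip=0x%llx\n", (unsigned long long)v.nip); /* 0x1000 */
	return 0;
}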
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 303e3cb096db..ebd5d920de8c 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -10,13 +10,13 @@
#include <linux/gfp.h>
#include <linux/anon_inodes.h>
#include <linux/spinlock.h>
-
+#include <linux/debugfs.h>
#include <linux/uaccess.h>
+
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xics.h>
-#include <asm/debugfs.h>
#include <asm/time.h>
#include <linux/seq_file.h>
@@ -1024,7 +1024,7 @@ static void xics_debugfs_init(struct kvmppc_xics *xics)
return;
}
- xics->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
+ xics->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir,
xics, &xics_debug_fops);
pr_debug("%s: created %s\n", __func__, name);
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 8cfab3547494..a18db9e16ea4 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -22,7 +22,6 @@
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/debug.h>
-#include <asm/debugfs.h>
#include <asm/time.h>
#include <asm/opal.h>
@@ -59,6 +58,25 @@
*/
#define XIVE_Q_GAP 2
+static bool kvmppc_xive_vcpu_has_save_restore(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+ /* Check enablement at VP level */
+ return xc->vp_cam & TM_QW1W2_HO;
+}
+
+bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ struct kvmppc_xive *xive = xc->xive;
+
+ if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE)
+ return kvmppc_xive_vcpu_has_save_restore(vcpu);
+
+ return true;
+}
+
/*
* Push a vcpu's context to the XIVE on guest entry.
* This assumes we are in virtual mode (MMU on)
@@ -77,7 +95,8 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
return;
eieio();
- __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
+ if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
+ __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
vcpu->arch.xive_pushed = 1;
eieio();
@@ -149,7 +168,8 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
/* First load to pull the context, we ignore the value */
__raw_readl(tima + TM_SPC_PULL_OS_CTX);
/* Second load to recover the context state (Words 0 and 1) */
- vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);
+ if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
+ vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);
/* Fixup some of the state for the next load */
vcpu->arch.xive_saved_state.lsmfb = 0;
@@ -363,9 +383,9 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
if (!vcpu->arch.xive_vcpu)
continue;
rc = xive_provision_queue(vcpu, prio);
- if (rc == 0 && !xive->single_escalation)
+ if (rc == 0 && !kvmppc_xive_has_single_escalation(xive))
kvmppc_xive_attach_escalation(vcpu, prio,
- xive->single_escalation);
+ kvmppc_xive_has_single_escalation(xive));
if (rc)
return rc;
}
@@ -922,13 +942,13 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
}
int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
- struct irq_desc *host_desc)
+ unsigned long host_irq)
{
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvmppc_xive_src_block *sb;
struct kvmppc_xive_irq_state *state;
- struct irq_data *host_data = irq_desc_get_irq_data(host_desc);
- unsigned int host_irq = irq_desc_get_irq(host_desc);
+ struct irq_data *host_data =
+ irq_domain_get_irq_data(irq_get_default_host(), host_irq);
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
u16 idx;
u8 prio;
@@ -937,7 +957,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
if (!xive)
return -ENODEV;
- pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n",guest_irq, hw_irq);
+ pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n",
+ __func__, guest_irq, host_irq, hw_irq);
sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
if (!sb)
@@ -959,7 +980,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
*/
rc = irq_set_vcpu_affinity(host_irq, state);
if (rc) {
- pr_err("Failed to set VCPU affinity for irq %d\n", host_irq);
+ pr_err("Failed to set VCPU affinity for host IRQ %ld\n", host_irq);
return rc;
}
@@ -1019,12 +1040,11 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);
int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
- struct irq_desc *host_desc)
+ unsigned long host_irq)
{
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvmppc_xive_src_block *sb;
struct kvmppc_xive_irq_state *state;
- unsigned int host_irq = irq_desc_get_irq(host_desc);
u16 idx;
u8 prio;
int rc;
@@ -1032,7 +1052,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
if (!xive)
return -ENODEV;
- pr_devel("clr_mapped girq 0x%lx...\n", guest_irq);
+ pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n", __func__, guest_irq, host_irq);
sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
if (!sb)
@@ -1059,7 +1079,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
/* Release the passed-through interrupt to the host */
rc = irq_set_vcpu_affinity(host_irq, NULL);
if (rc) {
- pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq);
+ pr_err("Failed to clr VCPU affinity for host IRQ %ld\n", host_irq);
return rc;
}
@@ -1199,7 +1219,7 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
if (xc->esc_virq[i]) {
- if (xc->xive->single_escalation)
+ if (kvmppc_xive_has_single_escalation(xc->xive))
xive_cleanup_single_escalation(vcpu, xc,
xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
@@ -1319,6 +1339,12 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
if (r)
goto bail;
+ if (!kvmppc_xive_check_save_restore(vcpu)) {
+ pr_err("inconsistent save-restore setup for VCPU %d\n", cpu);
+ r = -EIO;
+ goto bail;
+ }
+
/* Configure VCPU fields for use by assembly push/pull */
vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
@@ -1340,7 +1366,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
* Enable the VP first as the single escalation mode will
* affect escalation interrupts numbering
*/
- r = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
+ r = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive));
if (r) {
pr_err("Failed to enable VP in OPAL, err %d\n", r);
goto bail;
@@ -1357,15 +1383,15 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct xive_q *q = &xc->queues[i];
/* Single escalation, no queue 7 */
- if (i == 7 && xive->single_escalation)
+ if (i == 7 && kvmppc_xive_has_single_escalation(xive))
break;
/* Is queue already enabled ? Provision it */
if (xive->qmap & (1 << i)) {
r = xive_provision_queue(vcpu, i);
- if (r == 0 && !xive->single_escalation)
+ if (r == 0 && !kvmppc_xive_has_single_escalation(xive))
kvmppc_xive_attach_escalation(
- vcpu, i, xive->single_escalation);
+ vcpu, i, kvmppc_xive_has_single_escalation(xive));
if (r)
goto bail;
} else {
@@ -1380,7 +1406,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
}
/* If not done above, attach priority 0 escalation */
- r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation);
+ r = kvmppc_xive_attach_escalation(vcpu, 0, kvmppc_xive_has_single_escalation(xive));
if (r)
goto bail;
@@ -2135,7 +2161,11 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
*/
xive->nr_servers = KVM_MAX_VCPUS;
- xive->single_escalation = xive_native_has_single_escalation();
+ if (xive_native_has_single_escalation())
+ xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
+
+ if (xive_native_has_save_restore())
+ xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE;
kvm->arch.xive = xive;
return 0;
@@ -2329,7 +2359,7 @@ static void xive_debugfs_init(struct kvmppc_xive *xive)
return;
}
- xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
+ xive->dentry = debugfs_create_file(name, S_IRUGO, arch_debugfs_dir,
xive, &xive_debug_fops);
pr_debug("%s: created %s\n", __func__, name);
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index afe9eeac6d56..e6a9651c6f1e 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -97,6 +97,9 @@ struct kvmppc_xive_ops {
int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq);
};
+#define KVMPPC_XIVE_FLAG_SINGLE_ESCALATION 0x1
+#define KVMPPC_XIVE_FLAG_SAVE_RESTORE 0x2
+
struct kvmppc_xive {
struct kvm *kvm;
struct kvm_device *dev;
@@ -133,7 +136,7 @@ struct kvmppc_xive {
u32 q_page_order;
/* Flags */
- u8 single_escalation;
+ u8 flags;
/* Number of entries in the VP block */
u32 nr_servers;
@@ -307,6 +310,12 @@ void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
struct kvmppc_xive_vcpu *xc, int irq);
int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp);
int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr);
+bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu);
+
+static inline bool kvmppc_xive_has_single_escalation(struct kvmppc_xive *xive)
+{
+ return xive->flags & KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
+}
#endif /* CONFIG_KVM_XICS */
#endif /* _KVM_PPC_BOOK3S_XICS_H */
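For reference, replacing the single_escalation bool with a flags byte, plus the new save/restore consistency check, boils down to the pattern below. This is a standalone sketch under assumed names (FLAG_*, fake_xive, fake_vcpu, with VP_HW_SAVE_RESTORE standing in for TM_QW1W2_HO), not the kernel structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_SINGLE_ESCALATION	0x1
#define FLAG_SAVE_RESTORE	0x2
#define VP_HW_SAVE_RESTORE	0x80	/* stand-in for TM_QW1W2_HO */

struct fake_xive { uint8_t flags; };
struct fake_vcpu { uint32_t vp_cam; struct fake_xive *xive; };

static bool has_single_escalation(struct fake_xive *xive)
{
	return xive->flags & FLAG_SINGLE_ESCALATION;
}

static bool check_save_restore(struct fake_vcpu *vcpu)
{
	/* If the device uses save/restore, the VP must be enabled for it too. */
	if (vcpu->xive->flags & FLAG_SAVE_RESTORE)
		return vcpu->vp_cam & VP_HW_SAVE_RESTORE;
	return true;
}

int main(void)
{
	struct fake_xive xive = { .flags = FLAG_SAVE_RESTORE };
	struct fake_vcpu vcpu = { .vp_cam = 0, .xive = &xive };

	printf("single escalation: %d\n", has_single_escalation(&xive));
	printf("save/restore consistent: %d\n", check_save_restore(&vcpu));
	return 0;
}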
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 573ecaab3597..99db9ac49901 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -20,7 +20,6 @@
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/debug.h>
-#include <asm/debugfs.h>
#include <asm/opal.h>
#include <linux/debugfs.h>
@@ -93,7 +92,7 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
/* Free the escalation irq */
if (xc->esc_virq[i]) {
- if (xc->xive->single_escalation)
+ if (kvmppc_xive_has_single_escalation(xc->xive))
xive_cleanup_single_escalation(vcpu, xc,
xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
@@ -168,11 +167,17 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
goto bail;
}
+ if (!kvmppc_xive_check_save_restore(vcpu)) {
+ pr_err("inconsistent save-restore setup for VCPU %d\n", server_num);
+ rc = -EIO;
+ goto bail;
+ }
+
/*
* Enable the VP first as the single escalation mode will
* affect escalation interrupts numbering
*/
- rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
+ rc = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive));
if (rc) {
pr_err("Failed to enable VP in OPAL: %d\n", rc);
goto bail;
@@ -693,7 +698,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
}
rc = kvmppc_xive_attach_escalation(vcpu, priority,
- xive->single_escalation);
+ kvmppc_xive_has_single_escalation(xive));
error:
if (rc)
kvmppc_xive_native_cleanup_queue(vcpu, priority);
@@ -820,7 +825,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive)
for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
/* Single escalation, no queue 7 */
- if (prio == 7 && xive->single_escalation)
+ if (prio == 7 && kvmppc_xive_has_single_escalation(xive))
break;
if (xc->esc_virq[prio]) {
@@ -1111,7 +1116,12 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
*/
xive->nr_servers = KVM_MAX_VCPUS;
- xive->single_escalation = xive_native_has_single_escalation();
+ if (xive_native_has_single_escalation())
+ xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
+
+ if (xive_native_has_save_restore())
+ xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE;
+
xive->ops = &kvmppc_xive_native_ops;
kvm->arch.xive = xive;
@@ -1257,7 +1267,7 @@ static void xive_native_debugfs_init(struct kvmppc_xive *xive)
return;
}
- xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
+ xive->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir,
xive, &xive_native_debug_fops);
pr_debug("%s: created %s\n", __func__, name);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index eae4ec2988fc..df8172da2301 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
-obj-$(CONFIG_PPC_PTDUMP) += ptdump/
+obj-$(CONFIG_PTDUMP_CORE) += ptdump/
obj-$(CONFIG_KASAN) += kasan/
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index 52e170bd95ae..d8279bfe68ea 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -787,7 +787,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
* TODO: add batching support when enabled. remember, no dynamic memory here,
* although there is the control page available...
*/
-static void native_hpte_clear(void)
+static notrace void native_hpte_clear(void)
{
unsigned long vpn = 0;
unsigned long slot, slots;
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index ac5720371c0d..c145776d3ae5 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -36,8 +36,8 @@
#include <linux/hugetlb.h>
#include <linux/cpu.h>
#include <linux/pgtable.h>
+#include <linux/debugfs.h>
-#include <asm/debugfs.h>
#include <asm/interrupt.h>
#include <asm/processor.h>
#include <asm/mmu.h>
@@ -2072,7 +2072,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n")
static int __init hash64_debugfs(void)
{
- debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, NULL,
+ debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL,
&fops_hpt_order);
return 0;
}
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 9ffa65074cb0..9e16c7b1a6c5 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -6,9 +6,9 @@
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/memblock.h>
+#include <linux/debugfs.h>
#include <misc/cxl-base.h>
-#include <asm/debugfs.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/trace.h>
@@ -172,8 +172,8 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/* For use by kexec */
-void mmu_cleanup_all(void)
+/* For use by kexec, called with MMU off */
+notrace void mmu_cleanup_all(void)
{
if (radix_enabled())
radix__mmu_cleanup_all();
@@ -520,7 +520,7 @@ static int __init pgtable_debugfs_setup(void)
* invalidated as expected.
*/
debugfs_create_bool("tlbie_enabled", 0600,
- powerpc_debugfs_root,
+ arch_debugfs_dir,
&tlbie_enabled);
return 0;
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e50ddf129c15..ae20add7954a 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -679,7 +679,8 @@ void radix__early_init_mmu_secondary(void)
mtspr(SPRN_UAMOR, 0);
}
-void radix__mmu_cleanup_all(void)
+/* Called during kexec sequence with MMU off */
+notrace void radix__mmu_cleanup_all(void)
{
unsigned long lpcr;
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index aefc100d79a7..7724af19ed7e 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -10,6 +10,7 @@
#include <linux/memblock.h>
#include <linux/mmu_context.h>
#include <linux/sched/mm.h>
+#include <linux/debugfs.h>
#include <asm/ppc-opcode.h>
#include <asm/tlb.h>
@@ -1106,8 +1107,8 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
* invalidating a full PID, so it has a far lower threshold to change from
* individual page flushes to full-pid flushes.
*/
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
+static u32 tlb_single_page_flush_ceiling __read_mostly = 33;
+static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
static inline void __radix__flush_tlb_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -1524,3 +1525,14 @@ void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,
EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+static int __init create_tlb_single_page_flush_ceiling(void)
+{
+ debugfs_create_u32("tlb_single_page_flush_ceiling", 0600,
+ arch_debugfs_dir, &tlb_single_page_flush_ceiling);
+ debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600,
+ arch_debugfs_dir, &tlb_local_single_page_flush_ceiling);
+ return 0;
+}
+late_initcall(create_tlb_single_page_flush_ceiling);
+
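The two new debugfs entries expose the ceilings that decide when the radix range-flush code gives up on per-page invalidations and flushes the whole PID. A rough standalone sketch of that decision, with simplified names and placeholder values (in the kernel the local ceiling defaults to POWER9_TLB_SETS_RADIX * 2), not the real __radix__flush_tlb_range() logic:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t single_page_flush_ceiling = 33;		/* global flushes */
static uint32_t local_single_page_flush_ceiling = 16;	/* placeholder value */

static bool use_full_pid_flush(uint64_t nr_pages, bool local)
{
	uint32_t ceiling = local ? local_single_page_flush_ceiling
				 : single_page_flush_ceiling;

	/* Above the ceiling, one full-PID flush beats many page flushes. */
	return nr_pages > ceiling;
}

int main(void)
{
	printf("64 pages, global: full flush = %d\n", use_full_pid_flush(64, false));
	printf("8 pages, local:   full flush = %d\n", use_full_pid_flush(8, true));
	return 0;
}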
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index c91bd85eb90e..f0037bcc47a0 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -822,7 +822,7 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault)
/* IRQs are not reconciled here, so can't check irqs_disabled */
VM_WARN_ON(mfmsr() & MSR_EE);
- if (unlikely(!(regs->msr & MSR_RI)))
+ if (regs_is_unrecoverable(regs))
return -EINVAL;
/*
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 9af3832c9d8d..22197b18d85e 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -18,6 +18,7 @@ static int n_root_addr_cells, n_root_size_cells;
static struct drmem_lmb_info __drmem_info;
struct drmem_lmb_info *drmem_info = &__drmem_info;
+static bool in_drmem_update;
u64 drmem_lmb_memory_max(void)
{
@@ -178,6 +179,11 @@ int drmem_update_dt(void)
if (!memory)
return -1;
+ /*
+ * Set in_drmem_update to prevent the notifier callback from processing the
+ * DT property back into the LMB tree, since the change originates from that tree.
+ */
+ in_drmem_update = true;
prop = of_find_property(memory, "ibm,dynamic-memory", NULL);
if (prop) {
rc = drmem_update_dt_v1(memory, prop);
@@ -186,6 +192,7 @@ int drmem_update_dt(void)
if (prop)
rc = drmem_update_dt_v2(memory, prop);
}
+ in_drmem_update = false;
of_node_put(memory);
return rc;
@@ -307,6 +314,45 @@ int __init walk_drmem_lmbs_early(unsigned long node, void *data,
return ret;
}
+/*
+ * Update the LMB associativity index.
+ */
+static int update_lmb(struct drmem_lmb *updated_lmb,
+ __maybe_unused const __be32 **usm,
+ __maybe_unused void *data)
+{
+ struct drmem_lmb *lmb;
+
+ for_each_drmem_lmb(lmb) {
+ if (lmb->drc_index != updated_lmb->drc_index)
+ continue;
+
+ lmb->aa_index = updated_lmb->aa_index;
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Update the LMB associativity index.
+ *
+ * This needs to be called when the hypervisor is updating the
+ * dynamic-reconfiguration-memory node property.
+ */
+void drmem_update_lmbs(struct property *prop)
+{
+ /*
+ * Don't update the LMBs if triggered by the update done in
+ * drmem_update_dt(); the LMB values have already been used to update the
+ * DT property in that case.
+ */
+ if (in_drmem_update)
+ return;
+ if (!strcmp(prop->name, "ibm,dynamic-memory"))
+ __walk_drmem_v1_lmbs(prop->value, NULL, NULL, update_lmb);
+ else if (!strcmp(prop->name, "ibm,dynamic-memory-v2"))
+ __walk_drmem_v2_lmbs(prop->value, NULL, NULL, update_lmb);
+}
#endif
static int init_drmem_lmb_size(struct device_node *dn)
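The in_drmem_update flag added above is a re-entrancy guard: when drmem_update_dt() writes the device-tree property from the LMB tree, the resulting property-change notification must not be imported back. A small standalone sketch of the same guard pattern, with made-up names:

#include <stdbool.h>
#include <stdio.h>

static bool in_self_update;

static void property_changed_callback(const char *prop)
{
	if (in_self_update)
		return;		/* change came from us; nothing to re-import */
	printf("importing external update to %s\n", prop);
}

static void export_tree_to_property(const char *prop)
{
	in_self_update = true;
	printf("writing %s from the in-memory tree\n", prop);
	/* ...the write triggers the property-change notification... */
	property_changed_callback(prop);
	in_self_update = false;
}

int main(void)
{
	export_tree_to_property("ibm,dynamic-memory");	/* callback ignores this one */
	property_changed_callback("ibm,dynamic-memory");	/* external change: imported */
	return 0;
}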
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7dac910c0b21..dd1cabc2ea0f 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -180,7 +180,7 @@ static inline void mmu_mark_rodata_ro(void) { }
void __init mmu_mapin_immr(void);
#endif
-#ifdef CONFIG_PPC_DEBUG_WX
+#ifdef CONFIG_DEBUG_WX
void ptdump_check_wx(void);
#else
static inline void ptdump_check_wx(void) { }
diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S
index 4613bf8e9aae..5add4a51e51f 100644
--- a/arch/powerpc/mm/nohash/tlb_low.S
+++ b/arch/powerpc/mm/nohash/tlb_low.S
@@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
* Touch enough instruction cache lines to ensure cache hits
*/
1: mflr r9
- bl 2f
+ bcl 20,31,$+4
2: mflr r6
li r7,32
PPC_ICBT(0,R6,R7) /* touch next cache line */
@@ -414,7 +414,7 @@ _GLOBAL(loadcam_multi)
* Set up temporary TLB entry that is the same as what we're
* running from, but in AS=1.
*/
- bl 1f
+ bcl 20,31,$+4
1: mflr r6
tlbsx 0,r8
mfspr r6,SPRN_MAS1
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..6f14c8fb6359 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -40,9 +40,6 @@ static int numa_enabled = 1;
static char *cmdline __initdata;
-static int numa_debug;
-#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
-
int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];
@@ -51,14 +48,22 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);
-static int min_common_depth;
+static int primary_domain_index;
static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+#define FORM2_AFFINITY 2
+static int affinity_form;
#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
+static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
+ [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
+};
+static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE };
/*
* Allocate node_to_cpumask_map based on number of available nodes
@@ -79,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void)
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
- dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
+ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
static int __init fake_numa_create_new_node(unsigned long end_pfn,
@@ -123,7 +128,7 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn,
cmdline = p;
fake_nid++;
*nid = fake_nid;
- dbg("created new fake_node with id %d\n", fake_nid);
+ pr_debug("created new fake_node with id %d\n", fake_nid);
return 1;
}
return 0;
@@ -137,33 +142,79 @@ static void reset_numa_cpu_lookup_table(void)
numa_cpu_lookup_table[cpu] = -1;
}
-static void map_cpu_to_node(int cpu, int node)
+void map_cpu_to_node(int cpu, int node)
{
update_numa_cpu_lookup_table(cpu, node);
- dbg("adding cpu %d to node %d\n", cpu, node);
-
- if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
+ if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) {
+ pr_debug("adding cpu %d to node %d\n", cpu, node);
cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
+ }
}
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
-static void unmap_cpu_from_node(unsigned long cpu)
+void unmap_cpu_from_node(unsigned long cpu)
{
int node = numa_cpu_lookup_table[cpu];
- dbg("removing cpu %lu from node %d\n", cpu, node);
-
if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
+ pr_debug("removing cpu %lu from node %d\n", cpu, node);
} else {
- printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
- cpu, node);
+ pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node);
}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
-int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static int __associativity_to_nid(const __be32 *associativity,
+ int max_array_sz)
+{
+ int nid;
+ /*
+ * primary_domain_index is a 1-based array index.
+ */
+ int index = primary_domain_index - 1;
+
+ if (!numa_enabled || index >= max_array_sz)
+ return NUMA_NO_NODE;
+
+ nid = of_read_number(&associativity[index], 1);
+
+ /* POWER4 LPAR uses 0xffff as invalid node */
+ if (nid == 0xffff || nid >= nr_node_ids)
+ nid = NUMA_NO_NODE;
+ return nid;
+}
+/*
+ * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
+ * info is found.
+ */
+static int associativity_to_nid(const __be32 *associativity)
+{
+ int array_sz = of_read_number(associativity, 1);
+
+ /* Skip the first element in the associativity array */
+ return __associativity_to_nid((associativity + 1), array_sz);
+}
+
+static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+ int dist;
+ int node1, node2;
+
+ node1 = associativity_to_nid(cpu1_assoc);
+ node2 = associativity_to_nid(cpu2_assoc);
+
+ dist = numa_distance_table[node1][node2];
+ if (dist <= LOCAL_DISTANCE)
+ return 0;
+ else if (dist <= REMOTE_DISTANCE)
+ return 1;
+ else
+ return 2;
+}
+
+static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
int dist = 0;
@@ -179,6 +230,15 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
return dist;
}
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+ /* We should not get called with FORM0 */
+ VM_WARN_ON(affinity_form == FORM0_AFFINITY);
+ if (affinity_form == FORM1_AFFINITY)
+ return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
+ return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc);
+}
+
/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
@@ -190,7 +250,9 @@ int __node_distance(int a, int b)
int i;
int distance = LOCAL_DISTANCE;
- if (!form1_affinity)
+ if (affinity_form == FORM2_AFFINITY)
+ return numa_distance_table[a][b];
+ else if (affinity_form == FORM0_AFFINITY)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
for (i = 0; i < distance_ref_points_depth; i++) {
@@ -205,52 +267,6 @@ int __node_distance(int a, int b)
}
EXPORT_SYMBOL(__node_distance);
-static void initialize_distance_lookup_table(int nid,
- const __be32 *associativity)
-{
- int i;
-
- if (!form1_affinity)
- return;
-
- for (i = 0; i < distance_ref_points_depth; i++) {
- const __be32 *entry;
-
- entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
- distance_lookup_table[nid][i] = of_read_number(entry, 1);
- }
-}
-
-/*
- * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
- * info is found.
- */
-static int associativity_to_nid(const __be32 *associativity)
-{
- int nid = NUMA_NO_NODE;
-
- if (!numa_enabled)
- goto out;
-
- if (of_read_number(associativity, 1) >= min_common_depth)
- nid = of_read_number(&associativity[min_common_depth], 1);
-
- /* POWER4 LPAR uses 0xffff as invalid node */
- if (nid == 0xffff || nid >= nr_node_ids)
- nid = NUMA_NO_NODE;
-
- if (nid > 0 &&
- of_read_number(associativity, 1) >= distance_ref_points_depth) {
- /*
- * Skip the length field and send start of associativity array
- */
- initialize_distance_lookup_table(nid, associativity + 1);
- }
-
-out:
- return nid;
-}
-
/* Returns the nid associated with the given device tree node,
* or -1 if not found.
*/
@@ -284,11 +300,159 @@ int of_node_to_nid(struct device_node *device)
}
EXPORT_SYMBOL(of_node_to_nid);
-static int __init find_min_common_depth(void)
+static void __initialize_form1_numa_distance(const __be32 *associativity,
+ int max_array_sz)
+{
+ int i, nid;
+
+ if (affinity_form != FORM1_AFFINITY)
+ return;
+
+ nid = __associativity_to_nid(associativity, max_array_sz);
+ if (nid != NUMA_NO_NODE) {
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ const __be32 *entry;
+ int index = be32_to_cpu(distance_ref_points[i]) - 1;
+
+ /*
+ * broken hierarchy, return with broken distance table
+ */
+ if (WARN(index >= max_array_sz, "Broken ibm,associativity property"))
+ return;
+
+ entry = &associativity[index];
+ distance_lookup_table[nid][i] = of_read_number(entry, 1);
+ }
+ }
+}
+
+static void initialize_form1_numa_distance(const __be32 *associativity)
+{
+ int array_sz;
+
+ array_sz = of_read_number(associativity, 1);
+ /* Skip the first element in the associativity array */
+ __initialize_form1_numa_distance(associativity + 1, array_sz);
+}
+
+/*
+ * Used to update distance information for a newly added node.
+ */
+void update_numa_distance(struct device_node *node)
{
- int depth;
+ int nid;
+
+ if (affinity_form == FORM0_AFFINITY)
+ return;
+ else if (affinity_form == FORM1_AFFINITY) {
+ const __be32 *associativity;
+
+ associativity = of_get_associativity(node);
+ if (!associativity)
+ return;
+
+ initialize_form1_numa_distance(associativity);
+ return;
+ }
+
+ /* FORM2 affinity */
+ nid = of_node_to_nid_single(node);
+ if (nid == NUMA_NO_NODE)
+ return;
+
+ /*
+ * With FORM2 we expect NUMA distance of all possible NUMA
+ * nodes to be provided during boot.
+ */
+ WARN(numa_distance_table[nid][nid] == -1,
+ "NUMA distance details for node %d not provided\n", nid);
+}
+
+/*
+ * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN}
+ * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
+ */
+static void initialize_form2_numa_distance_lookup_table(void)
+{
+ int i, j;
+ struct device_node *root;
+ const __u8 *numa_dist_table;
+ const __be32 *numa_lookup_index;
+ int numa_dist_table_length;
+ int max_numa_index, distance_index;
+
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ root = of_find_node_by_path("/ibm,opal");
+ else
+ root = of_find_node_by_path("/rtas");
+ if (!root)
+ root = of_find_node_by_path("/");
+
+ numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
+ max_numa_index = of_read_number(&numa_lookup_index[0], 1);
+
+ /* The first element of the array is the size, encoded as an int */
+ numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL);
+ numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1);
+ /* Skip the size, which is encoded as an int */
+ numa_dist_table += sizeof(__be32);
+
+ pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n",
+ numa_dist_table_length, max_numa_index);
+
+ for (i = 0; i < max_numa_index; i++)
+ /* +1 skip the max_numa_index in the property */
+ numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
+
+
+ if (numa_dist_table_length != max_numa_index * max_numa_index) {
+ WARN(1, "Wrong NUMA distance information\n");
+ /* consider everybody else just remote. */
+ for (i = 0; i < max_numa_index; i++) {
+ for (j = 0; j < max_numa_index; j++) {
+ int nodeA = numa_id_index_table[i];
+ int nodeB = numa_id_index_table[j];
+
+ if (nodeA == nodeB)
+ numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE;
+ else
+ numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE;
+ }
+ }
+ }
+
+ distance_index = 0;
+ for (i = 0; i < max_numa_index; i++) {
+ for (j = 0; j < max_numa_index; j++) {
+ int nodeA = numa_id_index_table[i];
+ int nodeB = numa_id_index_table[j];
+
+ numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++];
+ pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]);
+ }
+ }
+ of_node_put(root);
+}
+
+static int __init find_primary_domain_index(void)
+{
+ int index;
struct device_node *root;
+ /*
+ * Check for which form of affinity.
+ */
+ if (firmware_has_feature(FW_FEATURE_OPAL)) {
+ affinity_form = FORM1_AFFINITY;
+ } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
+ pr_debug("Using form 2 affinity\n");
+ affinity_form = FORM2_AFFINITY;
+ } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+ pr_debug("Using form 1 affinity\n");
+ affinity_form = FORM1_AFFINITY;
+ } else
+ affinity_form = FORM0_AFFINITY;
+
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
else
@@ -313,42 +477,37 @@ static int __init find_min_common_depth(void)
&distance_ref_points_depth);
if (!distance_ref_points) {
- dbg("NUMA: ibm,associativity-reference-points not found.\n");
+ pr_debug("ibm,associativity-reference-points not found.\n");
goto err;
}
distance_ref_points_depth /= sizeof(int);
-
- if (firmware_has_feature(FW_FEATURE_OPAL) ||
- firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
- dbg("Using form 1 affinity\n");
- form1_affinity = 1;
- }
-
- if (form1_affinity) {
- depth = of_read_number(distance_ref_points, 1);
- } else {
+ if (affinity_form == FORM0_AFFINITY) {
if (distance_ref_points_depth < 2) {
- printk(KERN_WARNING "NUMA: "
- "short ibm,associativity-reference-points\n");
+ pr_warn("short ibm,associativity-reference-points\n");
goto err;
}
- depth = of_read_number(&distance_ref_points[1], 1);
+ index = of_read_number(&distance_ref_points[1], 1);
+ } else {
+ /*
+ * Both FORM1 and FORM2 affinity find the primary domain details
+ * at the same offset.
+ */
+ index = of_read_number(distance_ref_points, 1);
}
-
/*
* Warn and cap if the hardware supports more than
* MAX_DISTANCE_REF_POINTS domains.
*/
if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
- printk(KERN_WARNING "NUMA: distance array capped at "
- "%d entries\n", MAX_DISTANCE_REF_POINTS);
+ pr_warn("distance array capped at %d entries\n",
+ MAX_DISTANCE_REF_POINTS);
distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
}
of_node_put(root);
- return depth;
+ return index;
err:
of_node_put(root);
@@ -426,6 +585,38 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
return 0;
}
+static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
+{
+ struct assoc_arrays aa = { .arrays = NULL };
+ int default_nid = NUMA_NO_NODE;
+ int nid = default_nid;
+ int rc, index;
+
+ if ((primary_domain_index < 0) || !numa_enabled)
+ return default_nid;
+
+ rc = of_get_assoc_arrays(&aa);
+ if (rc)
+ return default_nid;
+
+ if (primary_domain_index <= aa.array_sz &&
+ !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
+ const __be32 *associativity;
+
+ index = lmb->aa_index * aa.array_sz;
+ associativity = &aa.arrays[index];
+ nid = __associativity_to_nid(associativity, aa.array_sz);
+ if (nid > 0 && affinity_form == FORM1_AFFINITY) {
+ /*
+ * lookup array associativity entries have
+ * no length of the array as the first element.
+ */
+ __initialize_form1_numa_distance(associativity, aa.array_sz);
+ }
+ }
+ return nid;
+}
+
/*
* This is like of_node_to_nid_single() for memory represented in the
* ibm,dynamic-reconfiguration-memory node.
@@ -437,35 +628,28 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
int nid = default_nid;
int rc, index;
- if ((min_common_depth < 0) || !numa_enabled)
+ if ((primary_domain_index < 0) || !numa_enabled)
return default_nid;
rc = of_get_assoc_arrays(&aa);
if (rc)
return default_nid;
- if (min_common_depth <= aa.array_sz &&
+ if (primary_domain_index <= aa.array_sz &&
!(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
- index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
- nid = of_read_number(&aa.arrays[index], 1);
-
- if (nid == 0xffff || nid >= nr_node_ids)
- nid = default_nid;
+ const __be32 *associativity;
- if (nid > 0) {
- index = lmb->aa_index * aa.array_sz;
- initialize_distance_lookup_table(nid,
- &aa.arrays[index]);
- }
+ index = lmb->aa_index * aa.array_sz;
+ associativity = &aa.arrays[index];
+ nid = __associativity_to_nid(associativity, aa.array_sz);
}
-
return nid;
}
#ifdef CONFIG_PPC_SPLPAR
-static int vphn_get_nid(long lcpu)
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
{
- __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
long rc, hwid;
/*
@@ -485,12 +669,30 @@ static int vphn_get_nid(long lcpu)
rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
if (rc == H_SUCCESS)
- return associativity_to_nid(associativity);
+ return 0;
}
+ return -1;
+}
+
+static int vphn_get_nid(long lcpu)
+{
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+
+
+ if (!__vphn_get_associativity(lcpu, associativity))
+ return associativity_to_nid(associativity);
+
return NUMA_NO_NODE;
+
}
#else
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
+{
+ return -1;
+}
+
static int vphn_get_nid(long unused)
{
return NUMA_NO_NODE;
@@ -598,9 +800,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu)
static int ppc_numa_cpu_dead(unsigned int cpu)
{
-#ifdef CONFIG_HOTPLUG_CPU
- unmap_cpu_from_node(cpu);
-#endif
return 0;
}
@@ -685,7 +884,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
size = read_n_cells(n_mem_size_cells, usm);
}
- nid = of_drconf_to_nid_single(lmb);
+ nid = get_nid_and_numa_distance(lmb);
fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
&nid);
node_set_online(nid);
@@ -702,24 +901,31 @@ static int __init parse_numa_properties(void)
struct device_node *memory;
int default_nid = 0;
unsigned long i;
+ const __be32 *associativity;
if (numa_enabled == 0) {
- printk(KERN_WARNING "NUMA disabled by user\n");
+ pr_warn("disabled by user\n");
return -1;
}
- min_common_depth = find_min_common_depth();
+ primary_domain_index = find_primary_domain_index();
- if (min_common_depth < 0) {
+ if (primary_domain_index < 0) {
/*
- * if we fail to parse min_common_depth from device tree
+ * if we fail to parse primary_domain_index from device tree
* mark the numa disabled, boot with numa disabled.
*/
numa_enabled = false;
- return min_common_depth;
+ return primary_domain_index;
}
- dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+ pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index);
+
+ /*
+ * If it is FORM2 initialize the distance table here.
+ */
+ if (affinity_form == FORM2_AFFINITY)
+ initialize_form2_numa_distance_lookup_table();
/*
* Even though we connect cpus to numa domains later in SMP
@@ -727,18 +933,30 @@ static int __init parse_numa_properties(void)
* each node to be onlined must have NODE_DATA etc backing it.
*/
for_each_present_cpu(i) {
+ __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
struct device_node *cpu;
- int nid = vphn_get_nid(i);
+ int nid = NUMA_NO_NODE;
- /*
- * Don't fall back to default_nid yet -- we will plug
- * cpus into nodes once the memory scan has discovered
- * the topology.
- */
- if (nid == NUMA_NO_NODE) {
+ memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
+
+ if (__vphn_get_associativity(i, vphn_assoc) == 0) {
+ nid = associativity_to_nid(vphn_assoc);
+ initialize_form1_numa_distance(vphn_assoc);
+ } else {
+
+ /*
+ * Don't fall back to default_nid yet -- we will plug
+ * cpus into nodes once the memory scan has discovered
+ * the topology.
+ */
cpu = of_get_cpu_node(i, NULL);
BUG_ON(!cpu);
- nid = of_node_to_nid_single(cpu);
+
+ associativity = of_get_associativity(cpu);
+ if (associativity) {
+ nid = associativity_to_nid(associativity);
+ initialize_form1_numa_distance(associativity);
+ }
of_node_put(cpu);
}
@@ -774,8 +992,11 @@ new_range:
* have associativity properties. If none, then
* everything goes to default_nid.
*/
- nid = of_node_to_nid_single(memory);
- if (nid < 0)
+ associativity = of_get_associativity(memory);
+ if (associativity) {
+ nid = associativity_to_nid(associativity);
+ initialize_form1_numa_distance(associativity);
+ } else
nid = default_nid;
fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
@@ -811,10 +1032,8 @@ static void __init setup_nonnuma(void)
unsigned int nid = 0;
int i;
- printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
- top_of_ram, total_ram);
- printk(KERN_DEBUG "Memory hole size: %ldMB\n",
- (top_of_ram - total_ram) >> 20);
+ pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram);
+ pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20);
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
fake_numa_create_new_node(end_pfn, &nid);
@@ -893,7 +1112,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
static void __init find_possible_nodes(void)
{
struct device_node *rtas;
- const __be32 *domains;
+ const __be32 *domains = NULL;
int prop_length, max_nodes;
u32 i;
@@ -909,9 +1128,14 @@ static void __init find_possible_nodes(void)
* it doesn't exist, then fallback on ibm,max-associativity-domains.
* Current denotes what the platform can support compared to max
* which denotes what the Hypervisor can support.
+ *
+ * If the LPAR is migratable, new nodes might be activated after an LPM,
+ * so we should consider the max number in that case.
*/
- domains = of_get_property(rtas, "ibm,current-associativity-domains",
- &prop_length);
+ if (!of_get_property(of_root, "ibm,migratable-partition", NULL))
+ domains = of_get_property(rtas,
+ "ibm,current-associativity-domains",
+ &prop_length);
if (!domains) {
domains = of_get_property(rtas, "ibm,max-associativity-domains",
&prop_length);
@@ -919,14 +1143,16 @@ static void __init find_possible_nodes(void)
goto out;
}
- max_nodes = of_read_number(&domains[min_common_depth], 1);
+ max_nodes = of_read_number(&domains[primary_domain_index], 1);
+ pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);
+
for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
prop_length /= sizeof(int);
- if (prop_length > min_common_depth + 2)
+ if (prop_length > primary_domain_index + 2)
coregroup_enabled = 1;
out:
@@ -1014,9 +1240,6 @@ static int __init early_numa(char *p)
if (strstr(p, "off"))
numa_enabled = 0;
- if (strstr(p, "debug"))
- numa_debug = 1;
-
p = strstr(p, "fake=");
if (p)
cmdline = p + strlen("fake=");
@@ -1179,7 +1402,7 @@ static long vphn_get_associativity(unsigned long cpu,
switch (rc) {
case H_SUCCESS:
- dbg("VPHN hcall succeeded. Reset polling...\n");
+ pr_debug("VPHN hcall succeeded. Reset polling...\n");
goto out;
case H_FUNCTION:
@@ -1259,7 +1482,7 @@ int cpu_to_coregroup_id(int cpu)
goto out;
index = of_read_number(associativity, 1);
- if (index > min_common_depth + 1)
+ if (index > primary_domain_index + 1)
return of_read_number(&associativity[index - 1], 1);
out:
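The FORM2 parsing above reads two properties: ibm,numa-lookup-index-table maps table positions to node ids, and ibm,numa-distance-table is a row-major N*N byte matrix indexed by those positions. A standalone example with made-up sample values showing how the flattened table is expanded:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* {N, domainid1, ..., domainidN} with N = 2, nodes 0 and 4 */
	const uint32_t lookup_index[] = { 2, 0, 4 };
	/* N*N distances, row-major: 10 = local, 40 = remote */
	const uint8_t dist_table[] = { 10, 40, 40, 10 };

	int max_numa_index = lookup_index[0];
	int distance_table[8][8];
	int idx = 0;

	for (int i = 0; i < max_numa_index; i++) {
		for (int j = 0; j < max_numa_index; j++) {
			int node_a = lookup_index[i + 1];
			int node_b = lookup_index[j + 1];

			distance_table[node_a][node_b] = dist_table[idx++];
		}
	}

	printf("distance(0, 4) = %d\n", distance_table[0][4]);	/* 40 */
	printf("distance(4, 4) = %d\n", distance_table[4][4]);	/* 10 */
	return 0;
}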
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 86da2a669680..fac932eb8f9a 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -75,8 +75,10 @@ static const struct flag_info flag_array[] = {
};
struct pgtable_level pg_level[5] = {
- {
- }, { /* pgd */
+ { /* pgd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* p4d */
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pud */
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
index 712762be3cb1..4050cbb55acf 100644
--- a/arch/powerpc/mm/ptdump/Makefile
+++ b/arch/powerpc/mm/ptdump/Makefile
@@ -5,5 +5,10 @@ obj-y += ptdump.o
obj-$(CONFIG_4xx) += shared.o
obj-$(CONFIG_PPC_8xx) += 8xx.o
obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o
-obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o
-obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o
+obj-$(CONFIG_PPC_BOOK3S_32) += shared.o
+obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o
+
+ifdef CONFIG_PTDUMP_DEBUGFS
+obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o
+obj-$(CONFIG_PPC_BOOK3S_64) += hashpagetable.o
+endif
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index c4c628b03cf8..820c119013e4 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -7,7 +7,7 @@
*/
#include <linux/pgtable.h>
-#include <asm/debugfs.h>
+#include <linux/debugfs.h>
#include <asm/cpu_has_feature.h>
#include "ptdump.h"
@@ -57,7 +57,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool
#define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d)
-static int bats_show_603(struct seq_file *m, void *v)
+static int bats_show(struct seq_file *m, void *v)
{
seq_puts(m, "---[ Instruction Block Address Translation ]---\n");
@@ -88,22 +88,12 @@ static int bats_show_603(struct seq_file *m, void *v)
return 0;
}
-static int bats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, bats_show_603, NULL);
-}
-
-static const struct file_operations bats_fops = {
- .open = bats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(bats);
static int __init bats_init(void)
{
debugfs_create_file("block_address_translation", 0400,
- powerpc_debugfs_root, NULL, &bats_fops);
+ arch_debugfs_dir, NULL, &bats_fops);
return 0;
}
device_initcall(bats_init);
diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c
index 14f73868db66..5ad92d9dc5d1 100644
--- a/arch/powerpc/mm/ptdump/book3s64.c
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -103,8 +103,10 @@ static const struct flag_info flag_array[] = {
};
struct pgtable_level pg_level[5] = {
- {
- }, { /* pgd */
+ { /* pgd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* p4d */
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pud */
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index ad6df9a2e7c8..c7f824d294b2 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -526,17 +526,7 @@ static int ptdump_show(struct seq_file *m, void *v)
return 0;
}
-static int ptdump_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
static int ptdump_init(void)
{
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 5062c58b1e5b..bf251191e78d 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -16,6 +16,7 @@
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/ptdump.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <asm/fixmap.h>
@@ -54,11 +55,12 @@
*
*/
struct pg_state {
+ struct ptdump_state ptdump;
struct seq_file *seq;
const struct addr_marker *marker;
unsigned long start_address;
unsigned long start_pa;
- unsigned int level;
+ int level;
u64 current_flags;
bool check_wx;
unsigned long wx_pages;
@@ -102,6 +104,11 @@ static struct addr_marker address_markers[] = {
{ -1, NULL },
};
+static struct ptdump_range ptdump_range[] __ro_after_init = {
+ {TASK_SIZE_MAX, ~0UL},
+ {0, 0}
+};
+
#define pt_dump_seq_printf(m, fmt, args...) \
({ \
if (m) \
@@ -188,10 +195,9 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
}
-static void note_page_update_state(struct pg_state *st, unsigned long addr,
- unsigned int level, u64 val, unsigned long page_size)
+static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val)
{
- u64 flag = val & pg_level[level].mask;
+ u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
u64 pa = val & PTE_RPN_MASK;
st->level = level;
@@ -205,15 +211,15 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr,
}
}
-static void note_page(struct pg_state *st, unsigned long addr,
- unsigned int level, u64 val, unsigned long page_size)
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
{
- u64 flag = val & pg_level[level].mask;
+ u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
/* At first no level is set */
- if (!st->level) {
+ if (st->level == -1) {
pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
- note_page_update_state(st, addr, level, val, page_size);
+ note_page_update_state(st, addr, level, val);
/*
* Dump the section of virtual memory when:
* - the PTE flags from one entry to the next differs.
@@ -242,95 +248,7 @@ static void note_page(struct pg_state *st, unsigned long addr,
* Address indicates we have passed the end of the
* current section of virtual memory
*/
- note_page_update_state(st, addr, level, val, page_size);
- }
-}
-
-static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
-{
- pte_t *pte = pte_offset_kernel(pmd, 0);
- unsigned long addr;
- unsigned int i;
-
- for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
- addr = start + i * PAGE_SIZE;
- note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE);
-
- }
-}
-
-static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start,
- int pdshift, int level)
-{
-#ifdef CONFIG_ARCH_HAS_HUGEPD
- unsigned int i;
- int shift = hugepd_shift(*phpd);
- int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1;
-
- if (start & ((1 << shift) - 1))
- return;
-
- for (i = 0; i < ptrs_per_hpd; i++) {
- unsigned long addr = start + (i << shift);
- pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
-
- note_page(st, addr, level + 1, pte_val(*pte), 1 << shift);
- }
-#endif
-}
-
-static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
-{
- pmd_t *pmd = pmd_offset(pud, 0);
- unsigned long addr;
- unsigned int i;
-
- for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
- addr = start + i * PMD_SIZE;
- if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd))
- /* pmd exists */
- walk_pte(st, pmd, addr);
- else
- note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE);
- }
-}
-
-static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
-{
- pud_t *pud = pud_offset(p4d, 0);
- unsigned long addr;
- unsigned int i;
-
- for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
- addr = start + i * PUD_SIZE;
- if (!pud_none(*pud) && !pud_is_leaf(*pud))
- /* pud exists */
- walk_pmd(st, pud, addr);
- else
- note_page(st, addr, 2, pud_val(*pud), PUD_SIZE);
- }
-}
-
-static void walk_pagetables(struct pg_state *st)
-{
- unsigned int i;
- unsigned long addr = st->start_address & PGDIR_MASK;
- pgd_t *pgd = pgd_offset_k(addr);
-
- /*
- * Traverse the linux pagetable structure and dump pages that are in
- * the hash pagetable.
- */
- for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
- p4d_t *p4d = p4d_offset(pgd, 0);
-
- if (p4d_none(*p4d) || p4d_is_leaf(*p4d))
- note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE);
- else if (is_hugepd(__hugepd(p4d_val(*p4d))))
- walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1);
- else
- /* p4d exists */
- walk_pud(st, p4d, addr);
+ note_page_update_state(st, addr, level, val);
}
}
@@ -383,32 +301,19 @@ static int ptdump_show(struct seq_file *m, void *v)
struct pg_state st = {
.seq = m,
.marker = address_markers,
- .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
+ .level = -1,
+ .ptdump = {
+ .note_page = note_page,
+ .range = ptdump_range,
+ }
};
-#ifdef CONFIG_PPC64
- if (!radix_enabled())
- st.start_address = KERN_VIRT_START;
-#endif
-
/* Traverse kernel page tables */
- walk_pagetables(&st);
- note_page(&st, 0, 0, 0, 0);
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
return 0;
}
-
-static int ptdump_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
static void build_pgtable_complete_mask(void)
{
@@ -420,22 +325,24 @@ static void build_pgtable_complete_mask(void)
pg_level[i].mask |= pg_level[i].flag[j].mask;
}
-#ifdef CONFIG_PPC_DEBUG_WX
+#ifdef CONFIG_DEBUG_WX
void ptdump_check_wx(void)
{
struct pg_state st = {
.seq = NULL,
- .marker = address_markers,
+ .marker = (struct addr_marker[]) {
+ { 0, NULL},
+ { -1, NULL},
+ },
+ .level = -1,
.check_wx = true,
- .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
+ .ptdump = {
+ .note_page = note_page,
+ .range = ptdump_range,
+ }
};
-#ifdef CONFIG_PPC64
- if (!radix_enabled())
- st.start_address = KERN_VIRT_START;
-#endif
-
- walk_pagetables(&st);
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
if (st.wx_pages)
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
@@ -445,12 +352,23 @@ void ptdump_check_wx(void)
}
#endif
-static int ptdump_init(void)
+static int __init ptdump_init(void)
{
+#ifdef CONFIG_PPC64
+ if (!radix_enabled())
+ ptdump_range[0].start = KERN_VIRT_START;
+ else
+ ptdump_range[0].start = PAGE_OFFSET;
+
+ ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD);
+#endif
+
populate_markers();
build_pgtable_complete_mask();
- debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
- &ptdump_fops);
+
+ if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS))
+ debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+
return 0;
}
device_initcall(ptdump_init);
diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c
index 565048a0c9be..9df3af8d481f 100644
--- a/arch/powerpc/mm/ptdump/segment_regs.c
+++ b/arch/powerpc/mm/ptdump/segment_regs.c
@@ -6,7 +6,7 @@
* This dumps the content of Segment Registers
*/
-#include <asm/debugfs.h>
+#include <linux/debugfs.h>
static void seg_show(struct seq_file *m, int i)
{
@@ -41,21 +41,11 @@ static int sr_show(struct seq_file *m, void *v)
return 0;
}
-static int sr_open(struct inode *inode, struct file *file)
-{
- return single_open(file, sr_show, NULL);
-}
-
-static const struct file_operations sr_fops = {
- .open = sr_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(sr);
static int __init sr_init(void)
{
- debugfs_create_file("segment_registers", 0400, powerpc_debugfs_root,
+ debugfs_create_file("segment_registers", 0400, arch_debugfs_dir,
NULL, &sr_fops);
return 0;
}
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index c005fe041c18..03607ab90c66 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -68,8 +68,10 @@ static const struct flag_info flag_array[] = {
};
struct pgtable_level pg_level[5] = {
- {
- }, { /* pgd */
+ { /* pgd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* p4d */
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pud */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index bb0ee716de91..73e62e9b179b 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -340,6 +340,13 @@ static inline void perf_read_regs(struct pt_regs *regs)
* If the PMU doesn't update the SIAR for non marked events use
* pt_regs.
*
+ * If regs is a kernel interrupt, always use SIAR. Some PMUs have an
+ * issue with regs_sipr not being in synch with SIAR in interrupt entry
+ * and return sequences, which can result in regs_sipr being true for
+ * kernel interrupts and SIAR, which has the effect of causing samples
+ * to pile up at mtmsrd MSR[EE] 0->1 or pending irq replay around
+ * interrupt entry/exit.
+ *
* If the PMU has HV/PR flags then check to see if they
* place the exception in userspace. If so, use pt_regs. In
* continuous sampling mode the SIAR and the PMU exception are
@@ -356,6 +363,8 @@ static inline void perf_read_regs(struct pt_regs *regs)
use_siar = 1;
else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING))
use_siar = 0;
+ else if (!user_mode(regs))
+ use_siar = 1;
else if (!(ppmu->flags & PPMU_NO_SIPR) && regs_sipr(regs))
use_siar = 0;
else
@@ -2251,18 +2260,10 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
*/
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
- bool use_siar = regs_use_siar(regs);
unsigned long siar = mfspr(SPRN_SIAR);
- if (ppmu && (ppmu->flags & PPMU_P10_DD1)) {
- if (siar)
- return siar;
- else
- return regs->nip;
- } else if (use_siar && siar_valid(regs))
- return mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
- else if (use_siar)
- return 0; // no valid instruction pointer
+ if (regs_use_siar(regs) && siar_valid(regs) && siar)
+ return siar + perf_ip_adjust(regs);
else
return regs->nip;
}
diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c
index d48413e28c39..c756228a081f 100644
--- a/arch/powerpc/perf/hv-gpci.c
+++ b/arch/powerpc/perf/hv-gpci.c
@@ -175,7 +175,7 @@ static unsigned long single_gpci_request(u32 req, u32 starting_index,
*/
count = 0;
for (i = offset; i < offset + length; i++)
- count |= arg->bytes[i] << (i - offset);
+ count |= (u64)(arg->bytes[i]) << ((length - 1 - (i - offset)) * 8);
*value = count;
out:
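
The fix above widens each byte to u64 before shifting and places it according to its big-endian position, instead of shifting by the byte index. A standalone sketch of the corrected assembly, with be_bytes_to_u64() as a hypothetical helper name:

#include <stdint.h>
#include <stddef.h>

/*
 * Combine `length` big-endian bytes starting at `offset` into a host
 * u64, mirroring the fixed loop: the first byte is the most significant,
 * so byte i is shifted left by (length - 1 - i) * 8 bits; the cast to
 * u64 keeps shifts of 32 or more from overflowing an int.
 */
static uint64_t be_bytes_to_u64(const uint8_t *bytes, size_t offset, size_t length)
{
	uint64_t count = 0;
	size_t i;

	for (i = offset; i < offset + length; i++)
		count |= (uint64_t)bytes[i] << ((length - 1 - (i - offset)) * 8);

	return count;
}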
diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c
index a5c898bb9bab..5d19daacd78a 100644
--- a/arch/powerpc/platforms/44x/machine_check.c
+++ b/arch/powerpc/platforms/44x/machine_check.c
@@ -11,7 +11,7 @@
int machine_check_440A(struct pt_regs *regs)
{
- unsigned long reason = regs->dsisr;
+ unsigned long reason = regs->esr;
printk("Machine check in kernel mode.\n");
if (reason & ESR_IMCP){
@@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs)
#ifdef CONFIG_PPC_47x
int machine_check_47x(struct pt_regs *regs)
{
- unsigned long reason = regs->dsisr;
+ unsigned long reason = regs->esr;
u32 mcsr;
printk(KERN_ERR "Machine check in kernel mode.\n");
diff --git a/arch/powerpc/platforms/4xx/machine_check.c b/arch/powerpc/platforms/4xx/machine_check.c
index a71c29892a91..a905da1d6f41 100644
--- a/arch/powerpc/platforms/4xx/machine_check.c
+++ b/arch/powerpc/platforms/4xx/machine_check.c
@@ -10,7 +10,7 @@
int machine_check_4xx(struct pt_regs *regs)
{
- unsigned long reason = regs->dsisr;
+ unsigned long reason = regs->esr;
if (reason & ESR_IMCP) {
printk("Instruction");
diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig
index b77cbb0a50e1..4142ebf01382 100644
--- a/arch/powerpc/platforms/85xx/Kconfig
+++ b/arch/powerpc/platforms/85xx/Kconfig
@@ -208,12 +208,6 @@ config TQM8560
select TQM85xx
select CPM2
-config SBC8548
- bool "Wind River SBC8548"
- select DEFAULT_UIMAGE
- help
- This option enables support for the Wind River SBC8548 board
-
config PPA8548
bool "Prodrive PPA8548"
help
diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile
index d1dd0dca5ebf..60e4e97a929d 100644
--- a/arch/powerpc/platforms/85xx/Makefile
+++ b/arch/powerpc/platforms/85xx/Makefile
@@ -26,7 +26,6 @@ obj-$(CONFIG_CORENET_GENERIC) += corenet_generic.o
obj-$(CONFIG_FB_FSL_DIU) += t1042rdb_diu.o
obj-$(CONFIG_STX_GP3) += stx_gp3.o
obj-$(CONFIG_TQM85xx) += tqm85xx.o
-obj-$(CONFIG_SBC8548) += sbc8548.o
obj-$(CONFIG_PPA8548) += ppa8548.o
obj-$(CONFIG_SOCRATES) += socrates.o socrates_fpga_pic.o
obj-$(CONFIG_KSI8560) += ksi8560.o
diff --git a/arch/powerpc/platforms/85xx/sbc8548.c b/arch/powerpc/platforms/85xx/sbc8548.c
deleted file mode 100644
index e4acf5ce6b07..000000000000
--- a/arch/powerpc/platforms/85xx/sbc8548.c
+++ /dev/null
@@ -1,134 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Wind River SBC8548 setup and early boot code.
- *
- * Copyright 2007 Wind River Systems Inc.
- *
- * By Paul Gortmaker (see MAINTAINERS for contact information)
- *
- * Based largely on the MPC8548CDS support - Copyright 2005 Freescale Inc.
- */
-
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/reboot.h>
-#include <linux/pci.h>
-#include <linux/kdev_t.h>
-#include <linux/major.h>
-#include <linux/console.h>
-#include <linux/delay.h>
-#include <linux/seq_file.h>
-#include <linux/initrd.h>
-#include <linux/interrupt.h>
-#include <linux/fsl_devices.h>
-#include <linux/of_platform.h>
-#include <linux/pgtable.h>
-
-#include <asm/page.h>
-#include <linux/atomic.h>
-#include <asm/time.h>
-#include <asm/io.h>
-#include <asm/machdep.h>
-#include <asm/ipic.h>
-#include <asm/pci-bridge.h>
-#include <asm/irq.h>
-#include <mm/mmu_decl.h>
-#include <asm/prom.h>
-#include <asm/udbg.h>
-#include <asm/mpic.h>
-
-#include <sysdev/fsl_soc.h>
-#include <sysdev/fsl_pci.h>
-
-#include "mpc85xx.h"
-
-static int sbc_rev;
-
-static void __init sbc8548_pic_init(void)
-{
- struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN,
- 0, 256, " OpenPIC ");
- BUG_ON(mpic == NULL);
- mpic_init(mpic);
-}
-
-/* Extract the HW Rev from the EPLD on the board */
-static int __init sbc8548_hw_rev(void)
-{
- struct device_node *np;
- struct resource res;
- unsigned int *rev;
- int board_rev = 0;
-
- np = of_find_compatible_node(NULL, NULL, "hw-rev");
- if (np == NULL) {
- printk("No HW-REV found in DTB.\n");
- return -ENODEV;
- }
-
- of_address_to_resource(np, 0, &res);
- of_node_put(np);
-
- rev = ioremap(res.start,sizeof(unsigned int));
- board_rev = (*rev) >> 28;
- iounmap(rev);
-
- return board_rev;
-}
-
-/*
- * Setup the architecture
- */
-static void __init sbc8548_setup_arch(void)
-{
- if (ppc_md.progress)
- ppc_md.progress("sbc8548_setup_arch()", 0);
-
- fsl_pci_assign_primary();
-
- sbc_rev = sbc8548_hw_rev();
-}
-
-static void sbc8548_show_cpuinfo(struct seq_file *m)
-{
- uint pvid, svid, phid1;
-
- pvid = mfspr(SPRN_PVR);
- svid = mfspr(SPRN_SVR);
-
- seq_printf(m, "Vendor\t\t: Wind River\n");
- seq_printf(m, "Machine\t\t: SBC8548 v%d\n", sbc_rev);
- seq_printf(m, "PVR\t\t: 0x%x\n", pvid);
- seq_printf(m, "SVR\t\t: 0x%x\n", svid);
-
- /* Display cpu Pll setting */
- phid1 = mfspr(SPRN_HID1);
- seq_printf(m, "PLL setting\t: 0x%x\n", ((phid1 >> 24) & 0x3f));
-}
-
-machine_arch_initcall(sbc8548, mpc85xx_common_publish_devices);
-
-/*
- * Called very early, device-tree isn't unflattened
- */
-static int __init sbc8548_probe(void)
-{
- return of_machine_is_compatible("SBC8548");
-}
-
-define_machine(sbc8548) {
- .name = "SBC8548",
- .probe = sbc8548_probe,
- .setup_arch = sbc8548_setup_arch,
- .init_IRQ = sbc8548_pic_init,
- .show_cpuinfo = sbc8548_show_cpuinfo,
- .get_irq = mpic_get_irq,
-#ifdef CONFIG_PCI
- .pcibios_fixup_bus = fsl_pcibios_fixup_bus,
- .pcibios_fixup_phb = fsl_pcibios_fixup_phb,
-#endif
- .calibrate_decr = generic_calibrate_decr,
- .progress = udbg_progress,
-};
diff --git a/arch/powerpc/platforms/86xx/Kconfig b/arch/powerpc/platforms/86xx/Kconfig
index 07a9d60c618a..be867abebc83 100644
--- a/arch/powerpc/platforms/86xx/Kconfig
+++ b/arch/powerpc/platforms/86xx/Kconfig
@@ -20,12 +20,6 @@ config MPC8641_HPCN
help
This option enables support for the MPC8641 HPCN board.
-config SBC8641D
- bool "Wind River SBC8641D"
- select DEFAULT_UIMAGE
- help
- This option enables support for the WRS SBC8641D board.
-
config MPC8610_HPCD
bool "Freescale MPC8610 HPCD"
select DEFAULT_UIMAGE
@@ -74,7 +68,7 @@ config MPC8641
select FSL_PCI if PCI
select PPC_UDBG_16550
select MPIC
- default y if MPC8641_HPCN || SBC8641D || GEF_SBC610 || GEF_SBC310 || GEF_PPC9A \
+ default y if MPC8641_HPCN || GEF_SBC610 || GEF_SBC310 || GEF_PPC9A \
|| MVME7100
config MPC8610
diff --git a/arch/powerpc/platforms/86xx/Makefile b/arch/powerpc/platforms/86xx/Makefile
index 2c04449be107..5bbe1475bf26 100644
--- a/arch/powerpc/platforms/86xx/Makefile
+++ b/arch/powerpc/platforms/86xx/Makefile
@@ -6,7 +6,6 @@
obj-y := pic.o common.o
obj-$(CONFIG_SMP) += mpc86xx_smp.o
obj-$(CONFIG_MPC8641_HPCN) += mpc86xx_hpcn.o
-obj-$(CONFIG_SBC8641D) += sbc8641d.o
obj-$(CONFIG_MPC8610_HPCD) += mpc8610_hpcd.o
obj-$(CONFIG_GEF_SBC610) += gef_sbc610.o
obj-$(CONFIG_GEF_SBC310) += gef_sbc310.o
diff --git a/arch/powerpc/platforms/86xx/sbc8641d.c b/arch/powerpc/platforms/86xx/sbc8641d.c
deleted file mode 100644
index dc23dd383d6e..000000000000
--- a/arch/powerpc/platforms/86xx/sbc8641d.c
+++ /dev/null
@@ -1,87 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SBC8641D board specific routines
- *
- * Copyright 2008 Wind River Systems Inc.
- *
- * By Paul Gortmaker (see MAINTAINERS for contact information)
- *
- * Based largely on the 8641 HPCN support by Freescale Semiconductor Inc.
- */
-
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/kdev_t.h>
-#include <linux/delay.h>
-#include <linux/seq_file.h>
-#include <linux/of_platform.h>
-
-#include <asm/time.h>
-#include <asm/machdep.h>
-#include <asm/pci-bridge.h>
-#include <asm/prom.h>
-#include <mm/mmu_decl.h>
-#include <asm/udbg.h>
-
-#include <asm/mpic.h>
-
-#include <sysdev/fsl_pci.h>
-#include <sysdev/fsl_soc.h>
-
-#include "mpc86xx.h"
-
-static void __init
-sbc8641_setup_arch(void)
-{
- if (ppc_md.progress)
- ppc_md.progress("sbc8641_setup_arch()", 0);
-
- printk("SBC8641 board from Wind River\n");
-
-#ifdef CONFIG_SMP
- mpc86xx_smp_init();
-#endif
-
- fsl_pci_assign_primary();
-}
-
-
-static void
-sbc8641_show_cpuinfo(struct seq_file *m)
-{
- uint svid = mfspr(SPRN_SVR);
-
- seq_printf(m, "Vendor\t\t: Wind River Systems\n");
-
- seq_printf(m, "SVR\t\t: 0x%x\n", svid);
-}
-
-
-/*
- * Called very early, device-tree isn't unflattened
- */
-static int __init sbc8641_probe(void)
-{
- if (of_machine_is_compatible("wind,sbc8641"))
- return 1; /* Looks good */
-
- return 0;
-}
-
-machine_arch_initcall(sbc8641, mpc86xx_common_publish_devices);
-
-define_machine(sbc8641) {
- .name = "SBC8641D",
- .probe = sbc8641_probe,
- .setup_arch = sbc8641_setup_arch,
- .init_IRQ = mpc86xx_init_irq,
- .show_cpuinfo = sbc8641_show_cpuinfo,
- .get_irq = mpic_get_irq,
- .time_init = mpc86xx_time_init,
- .calibrate_decr = generic_calibrate_decr,
- .progress = udbg_progress,
-#ifdef CONFIG_PCI
- .pcibios_fixup_bus = fsl_pcibios_fixup_bus,
-#endif
-};
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index ca2555b8a0c2..82335e364c44 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -12,8 +12,8 @@
#include <linux/export.h>
#include <linux/of_platform.h>
#include <linux/slab.h>
+#include <linux/debugfs.h>
-#include <asm/debugfs.h>
#include <asm/dcr.h>
#include <asm/machdep.h>
#include <asm/prom.h>
@@ -480,6 +480,6 @@ void axon_msi_debug_setup(struct device_node *dn, struct axon_msic *msic)
snprintf(name, sizeof(name), "msic_%d", of_node_to_nid(dn));
- debugfs_create_file(name, 0600, powerpc_debugfs_root, msic, &fops_msic);
+ debugfs_create_file(name, 0600, arch_debugfs_dir, msic, &fops_msic);
}
#endif /* DEBUG */
diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c
index 85521b3e7098..7a85b117f7a4 100644
--- a/arch/powerpc/platforms/embedded6xx/holly.c
+++ b/arch/powerpc/platforms/embedded6xx/holly.c
@@ -251,7 +251,7 @@ static int ppc750_machine_check_exception(struct pt_regs *regs)
/* Are we prepared to handle this fault */
if ((entry = search_exception_tables(regs->nip)) != NULL) {
tsi108_clear_pci_cfg_error();
- regs_set_return_msr(regs, regs->msr | MSR_RI);
+ regs_set_recoverable(regs);
regs_set_return_ip(regs, extable_fixup(entry));
return 1;
}
diff --git a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c
index d8da6a483e59..9eb9abb5bce2 100644
--- a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c
+++ b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c
@@ -173,7 +173,7 @@ static int mpc7448_machine_check_exception(struct pt_regs *regs)
/* Are we prepared to handle this fault */
if ((entry = search_exception_tables(regs->nip)) != NULL) {
tsi108_clear_pci_cfg_error();
- regs_set_return_msr(regs, regs->msr | MSR_RI);
+ regs_set_recoverable(regs);
regs_set_return_ip(regs, extable_fixup(entry));
return 1;
}
diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c
index 534b0317fc15..6087c70ed2ef 100644
--- a/arch/powerpc/platforms/pasemi/idle.c
+++ b/arch/powerpc/platforms/pasemi/idle.c
@@ -59,7 +59,7 @@ static int pasemi_system_reset_exception(struct pt_regs *regs)
restore_astate(hard_smp_processor_id());
/* everything handled */
- regs_set_return_msr(regs, regs->msr | MSR_RI);
+ regs_set_recoverable(regs);
return 1;
}
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 528a7e0cf83a..e3ffdc8e8567 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -199,12 +199,12 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
*/
power7_fastsleep_workaround_exit = false;
- get_online_cpus();
+ cpus_read_lock();
primary_thread_mask = cpu_online_cores_map();
on_each_cpu_mask(&primary_thread_mask,
pnv_fastsleep_workaround_apply,
&err, 1);
- put_online_cpus();
+ cpus_read_unlock();
if (err) {
pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply");
goto fail;
@@ -667,7 +667,6 @@ static unsigned long power9_idle_stop(unsigned long psscr)
sprs.purr = mfspr(SPRN_PURR);
sprs.spurr = mfspr(SPRN_SPURR);
sprs.dscr = mfspr(SPRN_DSCR);
- sprs.wort = mfspr(SPRN_WORT);
sprs.ciabr = mfspr(SPRN_CIABR);
sprs.mmcra = mfspr(SPRN_MMCRA);
@@ -785,7 +784,6 @@ core_woken:
mtspr(SPRN_PURR, sprs.purr);
mtspr(SPRN_SPURR, sprs.spurr);
mtspr(SPRN_DSCR, sprs.dscr);
- mtspr(SPRN_WORT, sprs.wort);
mtspr(SPRN_CIABR, sprs.ciabr);
mtspr(SPRN_MMCRA, sprs.mmcra);
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 537a4daed614..877720c64515 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -18,7 +18,6 @@
#include <linux/memory_hotplug.h>
#include <linux/numa.h>
#include <asm/machdep.h>
-#include <asm/debugfs.h>
#include <asm/cacheflush.h>
/* This enables us to keep track of the memory removed from each node. */
@@ -330,7 +329,7 @@ DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get,
static int memtrace_init(void)
{
memtrace_debugfs_dir = debugfs_create_dir("memtrace",
- powerpc_debugfs_root);
+ arch_debugfs_dir);
debugfs_create_file("enable", 0600, memtrace_debugfs_dir,
NULL, &memtrace_init_fops);
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 7824cc364bc4..05d3832019b9 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -13,11 +13,11 @@
#include <linux/of_address.h>
#include <linux/of_platform.h>
#include <linux/crash_dump.h>
+#include <linux/debugfs.h>
#include <asm/opal.h>
#include <asm/io.h>
#include <asm/imc-pmu.h>
#include <asm/cputhreads.h>
-#include <asm/debugfs.h>
static struct dentry *imc_debugfs_parent;
@@ -56,7 +56,7 @@ static void export_imc_mode_and_cmd(struct device_node *node,
u32 cb_offset;
struct imc_mem_info *ptr = pmu_ptr->mem_info;
- imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
+ imc_debugfs_parent = debugfs_create_dir("imc", arch_debugfs_dir);
if (of_property_read_u32(node, "cb_offset", &cb_offset))
cb_offset = IMC_CNTL_BLK_OFFSET;
@@ -186,7 +186,7 @@ static void disable_nest_pmu_counters(void)
int nid, cpu;
const struct cpumask *l_cpumask;
- get_online_cpus();
+ cpus_read_lock();
for_each_node_with_cpus(nid) {
l_cpumask = cpumask_of_node(nid);
cpu = cpumask_first_and(l_cpumask, cpu_online_mask);
@@ -195,7 +195,7 @@ static void disable_nest_pmu_counters(void)
opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
get_hard_smp_processor_id(cpu));
}
- put_online_cpus();
+ cpus_read_unlock();
}
static void disable_core_pmu_counters(void)
@@ -203,7 +203,7 @@ static void disable_core_pmu_counters(void)
cpumask_t cores_map;
int cpu, rc;
- get_online_cpus();
+ cpus_read_lock();
/* Disable the IMC Core functions */
cores_map = cpu_online_cores_map();
for_each_cpu(cpu, &cores_map) {
@@ -213,7 +213,7 @@ static void disable_core_pmu_counters(void)
pr_err("%s: Failed to stop Core (cpu = %d)\n",
__FUNCTION__, cpu);
}
- put_online_cpus();
+ cpus_read_unlock();
}
int get_max_nest_dev(void)
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index 608569082ba0..1e5d51db40f8 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -10,13 +10,13 @@
#include <linux/bug.h>
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/debugfs.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/opal.h>
#include <asm/prom.h>
#include <linux/uaccess.h>
-#include <asm/debugfs.h>
#include <asm/isa-bridge.h>
static int opal_lpc_chip_id = -1;
@@ -371,7 +371,7 @@ static int opal_lpc_init_debugfs(void)
if (opal_lpc_chip_id < 0)
return -ENODEV;
- root = debugfs_create_dir("lpc", powerpc_debugfs_root);
+ root = debugfs_create_dir("lpc", arch_debugfs_dir);
rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO);
rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM);
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index fd510d961b8c..6b4eed2ef4fa 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -14,11 +14,11 @@
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/debugfs.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/opal.h>
-#include <asm/debugfs.h>
#include <asm/prom.h>
static u64 opal_scom_unmangle(u64 addr)
@@ -189,7 +189,7 @@ static int scom_debug_init(void)
if (!firmware_has_feature(FW_FEATURE_OPAL))
return 0;
- root = debugfs_create_dir("scom", powerpc_debugfs_root);
+ root = debugfs_create_dir("scom", arch_debugfs_dir);
if (!root)
return -1;
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 2835376e61a4..e9d18519e650 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -588,7 +588,7 @@ static int opal_recover_mce(struct pt_regs *regs,
{
int recovered = 0;
- if (!(regs->msr & MSR_RI)) {
+ if (regs_is_unrecoverable(regs)) {
/* If MSR_RI isn't set, we cannot recover */
pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
recovered = 0;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7de464679292..3dd35c327d1c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -20,6 +20,7 @@
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <linux/sizes.h>
+#include <linux/debugfs.h>
#include <asm/sections.h>
#include <asm/io.h>
@@ -32,10 +33,10 @@
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/xics.h>
-#include <asm/debugfs.h>
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>
+#include <asm/xive.h>
#include <misc/cxl-base.h>
@@ -1962,27 +1963,40 @@ void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
pe->dma_setup_done = true;
}
-int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
+/*
+ * Called from KVM in real mode to EOI passthru interrupts. The ICP
+ * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru().
+ *
+ * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call
+ * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ
+ * numbers of the in-the-middle MSI domain are vector numbers, which is
+ * good enough for OPAL. Use those.
+ */
+int64_t pnv_opal_pci_msi_eoi(struct irq_data *d)
{
- struct pnv_phb *phb = container_of(chip, struct pnv_phb,
- ioda.irq_chip);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d->parent_data);
+ struct pnv_phb *phb = hose->private_data;
- return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+ return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq);
}
+/*
+ * The IRQ data is mapped in the XICS domain, with OPAL HW IRQ numbers
+ */
static void pnv_ioda2_msi_eoi(struct irq_data *d)
{
int64_t rc;
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
- struct irq_chip *chip = irq_data_get_irq_chip(d);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
- rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
+ rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
WARN_ON_ONCE(rc);
icp_native_eoi(d);
}
-
+/* P8/CXL only */
void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
{
struct irq_data *idata;
@@ -2004,27 +2018,32 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
}
irq_set_chip(virq, &phb->ioda.irq_chip);
+ irq_set_chip_data(virq, phb->hose);
}
+static struct irq_chip pnv_pci_msi_irq_chip;
+
/*
* Returns true iff chip is something that we could call
* pnv_opal_pci_msi_eoi for.
*/
bool is_pnv_opal_msi(struct irq_chip *chip)
{
- return chip->irq_eoi == pnv_ioda2_msi_eoi;
+ return chip == &pnv_pci_msi_irq_chip;
}
EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
-static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg)
+static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int xive_num,
+ unsigned int is_64, struct msi_msg *msg)
{
struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
- unsigned int xive_num = hwirq - phb->msi_base;
__be32 data;
int rc;
+ dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__,
+ is_64 ? "64" : "32", xive_num);
+
/* No PE assigned ? bail out ... no MSI for you ! */
if (pe == NULL)
return -ENXIO;
@@ -2072,12 +2091,209 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
}
msg->data = be32_to_cpu(data);
- pnv_set_msi_irq_chip(phb, virq);
+ return 0;
+}
+
+/*
+ * The msi_free() op is called before irq_domain_free_irqs_top() when
+ * the handler data is still available. Use that to clear the XIVE
+ * controller.
+ */
+static void pnv_msi_ops_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ unsigned int irq)
+{
+ if (xive_enabled())
+ xive_irq_free_data(irq);
+}
+
+static struct msi_domain_ops pnv_pci_msi_domain_ops = {
+ .msi_free = pnv_msi_ops_msi_free,
+};
+
+static void pnv_msi_shutdown(struct irq_data *d)
+{
+ d = d->parent_data;
+ if (d->chip->irq_shutdown)
+ d->chip->irq_shutdown(d);
+}
+
+static void pnv_msi_mask(struct irq_data *d)
+{
+ pci_msi_mask_irq(d);
+ irq_chip_mask_parent(d);
+}
+
+static void pnv_msi_unmask(struct irq_data *d)
+{
+ pci_msi_unmask_irq(d);
+ irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip pnv_pci_msi_irq_chip = {
+ .name = "PNV-PCI-MSI",
+ .irq_shutdown = pnv_msi_shutdown,
+ .irq_mask = pnv_msi_mask,
+ .irq_unmask = pnv_msi_unmask,
+ .irq_eoi = irq_chip_eoi_parent,
+};
+
+static struct msi_domain_info pnv_msi_domain_info = {
+ .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX),
+ .ops = &pnv_pci_msi_domain_ops,
+ .chip = &pnv_pci_msi_irq_chip,
+};
+
+static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg)
+{
+ struct msi_desc *entry = irq_data_get_msi_desc(d);
+ struct pci_dev *pdev = msi_desc_to_pci_dev(entry);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
+ int rc;
+
+ rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq,
+ entry->msi_attrib.is_64, msg);
+ if (rc)
+ dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n",
+ entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc);
+}
+
+/*
+ * The IRQ data is mapped in the MSI domain in which HW IRQ numbers
+ * correspond to vector numbers.
+ */
+static void pnv_msi_eoi(struct irq_data *d)
+{
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
+
+ if (phb->model == PNV_PHB_MODEL_PHB3) {
+ /*
+ * The EOI OPAL call takes an OPAL HW IRQ number but
+ * since it is translated into a vector number in
+ * OPAL, use that directly.
+ */
+ WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq));
+ }
+
+ irq_chip_eoi_parent(d);
+}
+
+static struct irq_chip pnv_msi_irq_chip = {
+ .name = "PNV-MSI",
+ .irq_shutdown = pnv_msi_shutdown,
+ .irq_mask = irq_chip_mask_parent,
+ .irq_unmask = irq_chip_unmask_parent,
+ .irq_eoi = pnv_msi_eoi,
+ .irq_set_affinity = irq_chip_set_affinity_parent,
+ .irq_compose_msi_msg = pnv_msi_compose_msg,
+};
+
+static int pnv_irq_parent_domain_alloc(struct irq_domain *domain,
+ unsigned int virq, int hwirq)
+{
+ struct irq_fwspec parent_fwspec;
+ int ret;
+
+ parent_fwspec.fwnode = domain->parent->fwnode;
+ parent_fwspec.param_count = 2;
+ parent_fwspec.param[0] = hwirq;
+ parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct pci_controller *hose = domain->host_data;
+ struct pnv_phb *phb = hose->private_data;
+ msi_alloc_info_t *info = arg;
+ struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc);
+ int hwirq;
+ int i, ret;
+
+ hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs);
+ if (hwirq < 0) {
+ dev_warn(&pdev->dev, "failed to find a free MSI\n");
+ return -ENOSPC;
+ }
+
+ dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__,
+ hose->dn, virq, hwirq, nr_irqs);
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = pnv_irq_parent_domain_alloc(domain, virq + i,
+ phb->msi_base + hwirq + i);
+ if (ret)
+ goto out;
+
+ irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+ &pnv_msi_irq_chip, hose);
+ }
+
+ return 0;
+
+out:
+ irq_domain_free_irqs_parent(domain, virq, i - 1);
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs);
+ return ret;
+}
+
+static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
+
+ pr_debug("%s bridge %pOF %d/%lx #%d\n", __func__, hose->dn,
+ virq, d->hwirq, nr_irqs);
+
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs);
+ /* XIVE domain is cleared through ->msi_free() */
+}
+
+static const struct irq_domain_ops pnv_irq_domain_ops = {
+ .alloc = pnv_irq_domain_alloc,
+ .free = pnv_irq_domain_free,
+};
+
+static int pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count)
+{
+ struct pnv_phb *phb = hose->private_data;
+ struct irq_domain *parent = irq_get_default_host();
+
+ hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id);
+ if (!hose->fwnode)
+ return -ENOMEM;
- pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
- " address=%x_%08x data=%x PE# %x\n",
- pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
- msg->address_hi, msg->address_lo, data, pe->pe_number);
+ hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count,
+ hose->fwnode,
+ &pnv_irq_domain_ops, hose);
+ if (!hose->dev_domain) {
+ pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n",
+ hose->dn, hose->global_number);
+ irq_domain_free_fwnode(hose->fwnode);
+ return -ENOMEM;
+ }
+
+ hose->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(hose->dn),
+ &pnv_msi_domain_info,
+ hose->dev_domain);
+ if (!hose->msi_domain) {
+ pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n",
+ hose->dn, hose->global_number);
+ irq_domain_free_fwnode(hose->fwnode);
+ irq_domain_remove(hose->dev_domain);
+ return -ENOMEM;
+ }
return 0;
}
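
pnv_msi_allocate_domains() above stacks three IRQ domains per PHB: a bottom XICS/XIVE controller domain (OPAL HW IRQ numbers), a middle per-PHB domain whose HW IRQ numbers are MSI vector numbers, and a top PCI-MSI domain that drivers allocate from. A condensed sketch of that layering, with error handling trimmed and all names taken from the hunk above:

static int msi_domains_sketch(struct pci_controller *hose, unsigned int count)
{
	struct pnv_phb *phb = hose->private_data;
	/* Bottom: the XICS (or XIVE) controller domain, OPAL HW IRQ numbers. */
	struct irq_domain *parent = irq_get_default_host();

	hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id);

	/* Middle: per-PHB hierarchy child, HW IRQ numbers are MSI vectors. */
	hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count,
						       hose->fwnode,
						       &pnv_irq_domain_ops, hose);

	/* Top: generic PCI-MSI domain sitting on the per-PHB domain. */
	hose->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(hose->dn),
						     &pnv_msi_domain_info,
						     hose->dev_domain);
	return hose->msi_domain ? 0 : -ENOMEM;
}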
@@ -2102,10 +2318,10 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
return;
}
- phb->msi_setup = pnv_pci_ioda_msi_setup;
- phb->msi32_support = 1;
pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
count, phb->msi_base);
+
+ pnv_msi_allocate_domains(phb->hose, count);
}
static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
@@ -2259,7 +2475,7 @@ static void pnv_pci_ioda_create_dbgfs(void)
phb = hose->private_data;
sprintf(name, "PCI%04x", hose->global_number);
- phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
+ phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir);
debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs,
phb, &pnv_pci_diag_data_fops);
@@ -2709,8 +2925,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
.dma_dev_setup = pnv_pci_ioda_dma_dev_setup,
.dma_bus_setup = pnv_pci_ioda_dma_bus_setup,
.iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
- .setup_msi_irqs = pnv_setup_msi_irqs,
- .teardown_msi_irqs = pnv_teardown_msi_irqs,
.enable_device_hook = pnv_pci_enable_device_hook,
.release_device = pnv_pci_release_device,
.window_alignment = pnv_pci_window_alignment,
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 6bb3c52633fb..9a8391b983d1 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -160,73 +160,6 @@ exit:
}
EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
-int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
-{
- struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
- struct msi_desc *entry;
- struct msi_msg msg;
- int hwirq;
- unsigned int virq;
- int rc;
-
- if (WARN_ON(!phb) || !phb->msi_bmp.bitmap)
- return -ENODEV;
-
- if (pdev->no_64bit_msi && !phb->msi32_support)
- return -ENODEV;
-
- for_each_pci_msi_entry(entry, pdev) {
- if (!entry->msi_attrib.is_64 && !phb->msi32_support) {
- pr_warn("%s: Supports only 64-bit MSIs\n",
- pci_name(pdev));
- return -ENXIO;
- }
- hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1);
- if (hwirq < 0) {
- pr_warn("%s: Failed to find a free MSI\n",
- pci_name(pdev));
- return -ENOSPC;
- }
- virq = irq_create_mapping(NULL, phb->msi_base + hwirq);
- if (!virq) {
- pr_warn("%s: Failed to map MSI to linux irq\n",
- pci_name(pdev));
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
- return -ENOMEM;
- }
- rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq,
- virq, entry->msi_attrib.is_64, &msg);
- if (rc) {
- pr_warn("%s: Failed to setup MSI\n", pci_name(pdev));
- irq_dispose_mapping(virq);
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
- return rc;
- }
- irq_set_msi_desc(virq, entry);
- pci_write_msi_msg(virq, &msg);
- }
- return 0;
-}
-
-void pnv_teardown_msi_irqs(struct pci_dev *pdev)
-{
- struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
- struct msi_desc *entry;
- irq_hw_number_t hwirq;
-
- if (WARN_ON(!phb))
- return;
-
- for_each_pci_msi_entry(entry, pdev) {
- if (!entry->irq)
- continue;
- hwirq = virq_to_hw(entry->irq);
- irq_set_msi_desc(entry->irq, NULL);
- irq_dispose_mapping(entry->irq);
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1);
- }
-}
-
/* Nicely print the contents of the PE State Tables (PEST). */
static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size)
{
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index c8d4f222a86f..966a9eb64339 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -123,11 +123,7 @@ struct pnv_phb {
#endif
unsigned int msi_base;
- unsigned int msi32_support;
struct msi_bitmap msi_bmp;
- int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg);
int (*init_m64)(struct pnv_phb *phb);
int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
@@ -289,8 +285,6 @@ extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
-extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
-extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn);
extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c
index 7ddc7ec6a7c0..ef710a715903 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -169,7 +169,8 @@ static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn,
spin_unlock_irqrestore(&ps3_htab_lock, flags);
}
-static void ps3_hpte_clear(void)
+/* Called during kexec sequence with MMU off */
+static notrace void ps3_hpte_clear(void)
{
unsigned long hpte_count = (1UL << ppc64_pft_size) >> 4;
u64 i;
diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c
index a81eac35d900..9c44f335c0b9 100644
--- a/arch/powerpc/platforms/ps3/mm.c
+++ b/arch/powerpc/platforms/ps3/mm.c
@@ -195,9 +195,11 @@ fail:
/**
* ps3_mm_vas_destroy -
+ *
+ * called during kexec sequence with MMU off.
*/
-void ps3_mm_vas_destroy(void)
+notrace void ps3_mm_vas_destroy(void)
{
int result;
@@ -1243,9 +1245,11 @@ void __init ps3_mm_init(void)
/**
* ps3_mm_shutdown - final cleanup of address space
+ *
+ * called during kexec sequence with MMU off.
*/
-void ps3_mm_shutdown(void)
+notrace void ps3_mm_shutdown(void)
{
ps3_mm_region_destroy(&map.r1);
}
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index 982f069e4c31..352af5b14a0f 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -11,10 +11,10 @@
#include <linux/spinlock.h>
#include <asm/smp.h>
#include <linux/uaccess.h>
+#include <linux/debugfs.h>
#include <asm/firmware.h>
#include <asm/dtl.h>
#include <asm/lppaca.h>
-#include <asm/debugfs.h>
#include <asm/plpar_wrappers.h>
#include <asm/machdep.h>
@@ -338,7 +338,7 @@ static int dtl_init(void)
/* set up common debugfs structure */
- dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root);
+ dtl_dir = debugfs_create_dir("dtl", arch_debugfs_dir);
debugfs_create_x8("dtl_event_mask", 0600, dtl_dir, &dtl_event_mask);
debugfs_create_u32("dtl_buf_entries", 0400, dtl_dir, &dtl_buf_entries);
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index 4c7b7f5a2ebc..f162156b7b68 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -119,10 +119,11 @@ struct vec5_fw_feature {
static __initdata struct vec5_fw_feature
vec5_fw_features_table[] = {
- {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY},
+ {FW_FEATURE_FORM1_AFFINITY, OV5_FORM1_AFFINITY},
{FW_FEATURE_PRRN, OV5_PRRN},
{FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2},
{FW_FEATURE_DRC_INFO, OV5_DRC_INFO},
+ {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY},
};
static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 7e970f81d8ff..d646c22e94ab 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -39,6 +39,12 @@
/* This version can't take the spinlock, because it never returns */
static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE;
+/*
+ * Record the CPU ids used on each node.
+ * Protected by cpu_add_remove_lock.
+ */
+static cpumask_var_t node_recorded_ids_map[MAX_NUMNODES];
+
static void rtas_stop_self(void)
{
static struct rtas_args args;
@@ -139,72 +145,148 @@ static void pseries_cpu_die(unsigned int cpu)
paca_ptrs[cpu]->cpu_start = 0;
}
+/**
+ * find_cpu_id_range - find a linear range of @nthreads free CPU ids.
+ * @nthreads : the number of threads (cpu ids)
+ * @assigned_node : the node they belong to, or NUMA_NO_NODE if free ids from
+ * any node can be picked.
+ * @cpu_mask: the returned CPU mask.
+ *
+ * Returns 0 on success.
+ */
+static int find_cpu_id_range(unsigned int nthreads, int assigned_node,
+ cpumask_var_t *cpu_mask)
+{
+ cpumask_var_t candidate_mask;
+ unsigned int cpu, node;
+ int rc = -ENOSPC;
+
+ if (!zalloc_cpumask_var(&candidate_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_clear(*cpu_mask);
+ for (cpu = 0; cpu < nthreads; cpu++)
+ cpumask_set_cpu(cpu, *cpu_mask);
+
+ BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
+
+ /* Get a bitmap of unoccupied slots. */
+ cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
+
+ if (assigned_node != NUMA_NO_NODE) {
+ /*
+ * Remove free ids previously assigned to other nodes. We can walk
+ * only online nodes because once a node has come online it is never
+ * taken offline again.
+ */
+ for_each_online_node(node) {
+ if (node == assigned_node)
+ continue;
+ cpumask_andnot(candidate_mask, candidate_mask,
+ node_recorded_ids_map[node]);
+ }
+ }
+
+ if (cpumask_empty(candidate_mask))
+ goto out;
+
+ while (!cpumask_empty(*cpu_mask)) {
+ if (cpumask_subset(*cpu_mask, candidate_mask))
+ /* Found a range where we can insert the new cpu(s) */
+ break;
+ cpumask_shift_left(*cpu_mask, *cpu_mask, nthreads);
+ }
+
+ if (!cpumask_empty(*cpu_mask))
+ rc = 0;
+
+out:
+ free_cpumask_var(candidate_mask);
+ return rc;
+}
+
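
The loop in find_cpu_id_range() slides a window of nthreads contiguous CPU ids, in steps of nthreads, until the whole window falls inside the mask of free ids. A toy illustration of the same search on a 64-bit word instead of a cpumask (assumes nthreads > 0; not kernel code):

#include <stdint.h>

static int toy_find_id_range(uint64_t free_ids, unsigned int nthreads,
			     unsigned int *first_id)
{
	/* Window of nthreads contiguous ids, starting at id 0. */
	uint64_t window = (nthreads >= 64) ? ~0ULL : ((1ULL << nthreads) - 1);
	unsigned int base;

	for (base = 0; base + nthreads <= 64; base += nthreads) {
		if ((window & free_ids) == window) {	/* window is a subset of free ids */
			*first_id = base;
			return 0;
		}
		window <<= nthreads;	/* slide the window by nthreads ids */
	}
	return -1;	/* -ENOSPC in the real code */
}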
/*
* Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle
- * here is that a cpu device node may represent up to two logical cpus
+ * here is that a cpu device node may represent multiple logical cpus
* in the SMT case. We must honor the assumption in other code that
* the logical ids for sibling SMT threads x and y are adjacent, such
* that x^1 == y and y^1 == x.
*/
static int pseries_add_processor(struct device_node *np)
{
- unsigned int cpu;
- cpumask_var_t candidate_mask, tmp;
- int err = -ENOSPC, len, nthreads, i;
+ int len, nthreads, node, cpu, assigned_node;
+ int rc = 0;
+ cpumask_var_t cpu_mask;
const __be32 *intserv;
intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
if (!intserv)
return 0;
- zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
- zalloc_cpumask_var(&tmp, GFP_KERNEL);
-
nthreads = len / sizeof(u32);
- for (i = 0; i < nthreads; i++)
- cpumask_set_cpu(i, tmp);
- cpu_maps_update_begin();
+ if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
- BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
+ /*
+ * Fetch from the DT nodes read by dlpar_configure_connector() the NUMA
+ * node id the added CPU belongs to.
+ */
+ node = of_node_to_nid(np);
+ if (node < 0 || !node_possible(node))
+ node = first_online_node;
- /* Get a bitmap of unoccupied slots. */
- cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
- if (cpumask_empty(candidate_mask)) {
- /* If we get here, it most likely means that NR_CPUS is
- * less than the partition's max processors setting.
+ BUG_ON(node == NUMA_NO_NODE);
+ assigned_node = node;
+
+ cpu_maps_update_begin();
+
+ rc = find_cpu_id_range(nthreads, node, &cpu_mask);
+ if (rc && nr_node_ids > 1) {
+ /*
+ * Try again, considering the free CPU ids from the other nodes.
*/
- printk(KERN_ERR "Cannot add cpu %pOF; this system configuration"
- " supports %d logical cpus.\n", np,
- num_possible_cpus());
- goto out_unlock;
+ node = NUMA_NO_NODE;
+ rc = find_cpu_id_range(nthreads, NUMA_NO_NODE, &cpu_mask);
}
- while (!cpumask_empty(tmp))
- if (cpumask_subset(tmp, candidate_mask))
- /* Found a range where we can insert the new cpu(s) */
- break;
- else
- cpumask_shift_left(tmp, tmp, nthreads);
-
- if (cpumask_empty(tmp)) {
- printk(KERN_ERR "Unable to find space in cpu_present_mask for"
- " processor %pOFn with %d thread(s)\n", np,
- nthreads);
- goto out_unlock;
+ if (rc) {
+ pr_err("Cannot add cpu %pOF; this system configuration"
+ " supports %d logical cpus.\n", np, num_possible_cpus());
+ goto out;
}
- for_each_cpu(cpu, tmp) {
+ for_each_cpu(cpu, cpu_mask) {
BUG_ON(cpu_present(cpu));
set_cpu_present(cpu, true);
set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++));
}
- err = 0;
-out_unlock:
+
+ /* Record the newly used CPU ids for the associated node. */
+ cpumask_or(node_recorded_ids_map[assigned_node],
+ node_recorded_ids_map[assigned_node], cpu_mask);
+
+ /*
+ * If node is set to NUMA_NO_NODE, CPU ids have been reused from
+ * another node; remove them from that node's mask.
+ */
+ if (node == NUMA_NO_NODE) {
+ cpu = cpumask_first(cpu_mask);
+ pr_warn("Reusing free CPU ids %d-%d from another node\n",
+ cpu, cpu + nthreads - 1);
+ for_each_online_node(node) {
+ if (node == assigned_node)
+ continue;
+ cpumask_andnot(node_recorded_ids_map[node],
+ node_recorded_ids_map[node],
+ cpu_mask);
+ }
+ }
+
+out:
cpu_maps_update_done();
- free_cpumask_var(candidate_mask);
- free_cpumask_var(tmp);
- return err;
+ free_cpumask_var(cpu_mask);
+ return rc;
}
/*
@@ -498,6 +580,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
return saved_rc;
}
+ update_numa_distance(dn);
+
rc = dlpar_online_cpu(dn);
if (rc) {
saved_rc = rc;
@@ -908,6 +992,7 @@ static struct notifier_block pseries_smp_nb = {
static int __init pseries_cpu_hotplug_init(void)
{
int qcss_tok;
+ unsigned int node;
#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
ppc_md.cpu_probe = dlpar_cpu_probe;
@@ -929,8 +1014,18 @@ static int __init pseries_cpu_hotplug_init(void)
smp_ops->cpu_die = pseries_cpu_die;
/* Processors can be added/removed only on LPAR */
- if (firmware_has_feature(FW_FEATURE_LPAR))
+ if (firmware_has_feature(FW_FEATURE_LPAR)) {
+ for_each_node(node) {
+ alloc_bootmem_cpumask_var(&node_recorded_ids_map[node]);
+
+ /* Record ids of CPU added at boot time */
+ cpumask_or(node_recorded_ids_map[node],
+ node_recorded_ids_map[node],
+ cpumask_of_node(node));
+ }
+
of_reconfig_notifier_register(&pseries_smp_nb);
+ }
return 0;
}
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 377d852f5a9a..44246ba59768 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
return -ENODEV;
}
+ update_numa_distance(lmb_node);
+
dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (!dr_node) {
dlpar_free_cc_nodes(lmb_node);
@@ -211,13 +213,11 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb)
{
unsigned long section_nr;
- struct mem_section *mem_sect;
struct memory_block *mem_block;
section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr));
- mem_sect = __nr_to_section(section_nr);
- mem_block = find_memory_block(mem_sect);
+ mem_block = find_memory_block(section_nr);
return mem_block;
}
@@ -979,6 +979,10 @@ static int pseries_memory_notifier(struct notifier_block *nb,
case OF_RECONFIG_DETACH_NODE:
err = pseries_remove_mem_node(rd->dn);
break;
+ case OF_RECONFIG_UPDATE_PROPERTY:
+ if (!strcmp(rd->dn->name,
+ "ibm,dynamic-reconfiguration-memory"))
+ drmem_update_lmbs(rd->prop);
}
return notifier_from_errno(err);
}
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 0c55b991f665..dab5c56ffd0e 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,28 +53,31 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
};
-static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+static struct iommu_table *iommu_pseries_alloc_table(int node)
{
- struct iommu_table_group *table_group;
struct iommu_table *tbl;
- table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
- node);
- if (!table_group)
- return NULL;
-
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
if (!tbl)
- goto free_group;
+ return NULL;
INIT_LIST_HEAD_RCU(&tbl->it_group_list);
kref_init(&tbl->it_kref);
+ return tbl;
+}
+
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+ struct iommu_table_group *table_group;
- table_group->tables[0] = tbl;
+ table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
+ if (!table_group)
+ return NULL;
- return table_group;
+ table_group->tables[0] = iommu_pseries_alloc_table(node);
+ if (table_group->tables[0])
+ return table_group;
-free_group:
kfree(table_group);
return NULL;
}
@@ -107,6 +110,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
u64 proto_tce;
__be64 *tcep;
u64 rpn;
+ const unsigned long tceshift = tbl->it_page_shift;
+ const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
proto_tce = TCE_PCI_READ; // Read allowed
@@ -117,10 +122,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
while (npages--) {
/* can't move this out since we might cross MEMBLOCK boundary */
- rpn = __pa(uaddr) >> TCE_SHIFT;
- *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
+ rpn = __pa(uaddr) >> tceshift;
+ *tcep = cpu_to_be64(proto_tce | rpn << tceshift);
- uaddr += TCE_PAGE_SIZE;
+ uaddr += pagesize;
tcep++;
}
return 0;
@@ -146,7 +151,7 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
return be64_to_cpu(*tcep);
}
-static void tce_free_pSeriesLP(unsigned long liobn, long, long);
+static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
@@ -166,12 +171,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
proto_tce |= TCE_PCI_WRITE;
while (npages--) {
- tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift;
+ tce = proto_tce | rpn << tceshift;
rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
ret = (int)rc;
- tce_free_pSeriesLP(liobn, tcenum_start,
+ tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
(npages_start - (npages + 1)));
break;
}
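
The hunks in this file replace the hard-coded 4K/12-bit shifts with the table's it_page_shift, so the real page number and the hcall offsets are derived from the actual IOMMU page size. A minimal sketch of the TCE encoding this relies on (the permission-bit values below are assumptions for illustration, not copied from asm/tce.h):

#include <stdint.h>
#include <stdbool.h>

#define TCE_PCI_READ	0x1ULL	/* assumed value, for illustration only */
#define TCE_PCI_WRITE	0x2ULL	/* assumed value, for illustration only */

/*
 * Build one TCE for a physical address, mirroring the fixed code: the
 * real page number is the physical address shifted right by the IOMMU
 * page shift, and it is placed back at that same shift in the entry,
 * alongside the permission bits in the low bits.
 */
static uint64_t build_tce(uint64_t paddr, unsigned int tceshift, bool writable)
{
	uint64_t proto_tce = TCE_PCI_READ | (writable ? TCE_PCI_WRITE : 0);
	uint64_t rpn = paddr >> tceshift;

	return proto_tce | (rpn << tceshift);
}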
@@ -205,10 +210,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
long tcenum_start = tcenum, npages_start = npages;
int ret = 0;
unsigned long flags;
+ const unsigned long tceshift = tbl->it_page_shift;
if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
return tce_build_pSeriesLP(tbl->it_index, tcenum,
- tbl->it_page_shift, npages, uaddr,
+ tceshift, npages, uaddr,
direction, attrs);
}
@@ -225,13 +231,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
if (!tcep) {
local_irq_restore(flags);
return tce_build_pSeriesLP(tbl->it_index, tcenum,
- tbl->it_page_shift,
+ tceshift,
npages, uaddr, direction, attrs);
}
__this_cpu_write(tce_page, tcep);
}
- rpn = __pa(uaddr) >> TCE_SHIFT;
+ rpn = __pa(uaddr) >> tceshift;
proto_tce = TCE_PCI_READ;
if (direction != DMA_TO_DEVICE)
proto_tce |= TCE_PCI_WRITE;
@@ -245,12 +251,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);
for (l = 0; l < limit; l++) {
- tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
+ tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift);
rpn++;
}
rc = plpar_tce_put_indirect((u64)tbl->it_index,
- (u64)tcenum << 12,
+ (u64)tcenum << tceshift,
(u64)__pa(tcep),
limit);
@@ -277,12 +283,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
return ret;
}
-static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages)
+static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
+ long npages)
{
u64 rc;
while (npages--) {
- rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0);
+ rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);
if (rc && printk_ratelimit()) {
printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
@@ -301,9 +308,11 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n
u64 rc;
if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))
- return tce_free_pSeriesLP(tbl->it_index, tcenum, npages);
+ return tce_free_pSeriesLP(tbl->it_index, tcenum,
+ tbl->it_page_shift, npages);
- rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
+ rc = plpar_tce_stuff((u64)tbl->it_index,
+ (u64)tcenum << tbl->it_page_shift, 0, npages);
if (rc && printk_ratelimit()) {
printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
@@ -319,7 +328,8 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
u64 rc;
unsigned long tce_ret;
- rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);
+ rc = plpar_tce_get((u64)tbl->it_index,
+ (u64)tcenum << tbl->it_page_shift, &tce_ret);
if (rc && printk_ratelimit()) {
printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
@@ -339,7 +349,7 @@ struct dynamic_dma_window_prop {
__be32 window_shift; /* ilog2(tce_window_size) */
};
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -359,12 +369,13 @@ struct ddw_create_response {
u32 addr_lo;
};
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
/* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
/* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -491,6 +502,24 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
}
+static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno,
+ unsigned long liobn, unsigned long win_addr,
+ unsigned long window_size, unsigned long page_shift,
+ void *base, struct iommu_table_ops *table_ops)
+{
+ tbl->it_busno = busno;
+ tbl->it_index = liobn;
+ tbl->it_offset = win_addr >> page_shift;
+ tbl->it_size = window_size >> page_shift;
+ tbl->it_page_shift = page_shift;
+ tbl->it_base = (unsigned long)base;
+ tbl->it_blocksize = 16;
+ tbl->it_type = TCE_PCI;
+ tbl->it_ops = table_ops;
+}
+
+struct iommu_table_ops iommu_table_pseries_ops;
+
static void iommu_table_setparms(struct pci_controller *phb,
struct device_node *dn,
struct iommu_table *tbl)
@@ -499,8 +528,13 @@ static void iommu_table_setparms(struct pci_controller *phb,
const unsigned long *basep;
const u32 *sizep;
- node = phb->dn;
+ /* Test if we are going over 2GB of DMA space */
+ if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
+ udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+ panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+ }
+ node = phb->dn;
basep = of_get_property(node, "linux,tce-base", NULL);
sizep = of_get_property(node, "linux,tce-size", NULL);
if (basep == NULL || sizep == NULL) {
@@ -509,33 +543,18 @@ static void iommu_table_setparms(struct pci_controller *phb,
return;
}
- tbl->it_base = (unsigned long)__va(*basep);
+ iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,
+ phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
+ __va(*basep), &iommu_table_pseries_ops);
if (!is_kdump_kernel())
memset((void *)tbl->it_base, 0, *sizep);
- tbl->it_busno = phb->bus->number;
- tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-
- /* Units of tce entries */
- tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
-
- /* Test if we are going over 2GB of DMA space */
- if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
- udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
- panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
- }
-
phb->dma_window_base_cur += phb->dma_window_size;
-
- /* Set the tce table size - measured in entries */
- tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
-
- tbl->it_index = 0;
- tbl->it_blocksize = 16;
- tbl->it_type = TCE_PCI;
}
+struct iommu_table_ops iommu_table_lpar_multi_ops;
+
/*
* iommu_table_setparms_lpar
*
@@ -547,17 +566,13 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
struct iommu_table_group *table_group,
const __be32 *dma_window)
{
- unsigned long offset, size;
+ unsigned long offset, size, liobn;
- of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);
+ of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
+
+ iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL,
+ &iommu_table_lpar_multi_ops);
- tbl->it_busno = phb->bus->number;
- tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
- tbl->it_base = 0;
- tbl->it_blocksize = 16;
- tbl->it_type = TCE_PCI;
- tbl->it_offset = offset >> tbl->it_page_shift;
- tbl->it_size = size >> tbl->it_page_shift;
table_group->tce32_start = offset;
table_group->tce32_size = size;
@@ -637,7 +652,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
tbl = pci->table_group->tables[0];
iommu_table_setparms(pci->phb, dn, tbl);
- tbl->it_ops = &iommu_table_pseries_ops;
+
if (!iommu_init_table(tbl, pci->phb->node, 0, 0))
panic("Failed to initialize iommu table");
@@ -698,7 +713,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
dn);
- /* Find nearest ibm,dma-window, walking up the device tree */
+ /*
+ * Find nearest ibm,dma-window (default DMA window), walking up the
+ * device tree
+ */
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
if (dma_window != NULL)
@@ -720,7 +738,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
tbl = ppci->table_group->tables[0];
iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
ppci->table_group, dma_window);
- tbl->it_ops = &iommu_table_lpar_multi_ops;
+
if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
panic("Failed to initialize iommu table");
iommu_register_group(ppci->table_group,
@@ -750,7 +768,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
tbl = PCI_DN(dn)->table_group->tables[0];
iommu_table_setparms(phb, dn, tbl);
- tbl->it_ops = &iommu_table_pseries_ops;
+
if (!iommu_init_table(tbl, phb->node, 0, 0))
panic("Failed to initialize iommu table");
@@ -785,17 +803,10 @@ static int __init disable_ddw_setup(char *str)
early_param("disable_ddw", disable_ddw_setup);
-static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
- struct property *win)
+static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp)
{
- struct dynamic_dma_window_prop *dwp;
- u64 liobn;
int ret;
- dwp = win->value;
- liobn = (u64)be32_to_cpu(dwp->liobn);
-
- /* clear the whole window, note the arg is in kernel pages */
ret = tce_clearrange_multi_pSeriesLP(0,
1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
if (ret)
@@ -804,94 +815,136 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
else
pr_debug("%pOF successfully cleared tces in window.\n",
np);
+}
+
+/*
+ * Call only if DMA window is clean.
+ */
+static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn)
+{
+ int ret;
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
- pr_warn("%pOF: failed to remove direct window: rtas returned "
+ pr_warn("%pOF: failed to remove DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
- pr_debug("%pOF: successfully removed direct window: rtas returned "
+ pr_debug("%pOF: successfully removed DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
}
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
+ struct property *win)
+{
+ struct dynamic_dma_window_prop *dwp;
+ u64 liobn;
+
+ dwp = win->value;
+ liobn = (u64)be32_to_cpu(dwp->liobn);
+
+ clean_dma_window(np, dwp);
+ __remove_dma_window(np, ddw_avail, liobn);
+}
+
+static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name)
{
struct property *win;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
int ret = 0;
+ win = of_find_property(np, win_name, NULL);
+ if (!win)
+ return -EINVAL;
+
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
&ddw_avail[0], DDW_APPLICABLE_SIZE);
if (ret)
- return;
+ return 0;
- win = of_find_property(np, DIRECT64_PROPNAME, NULL);
- if (!win)
- return;
if (win->length >= sizeof(struct dynamic_dma_window_prop))
remove_dma_window(np, ddw_avail, win);
if (!remove_prop)
- return;
+ return 0;
ret = of_remove_property(np, win);
if (ret)
- pr_warn("%pOF: failed to remove direct window property: %d\n",
+ pr_warn("%pOF: failed to remove DMA window property: %d\n",
np, ret);
+ return 0;
}
-static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift)
{
- struct direct_window *window;
- const struct dynamic_dma_window_prop *direct64;
- u64 dma_addr = 0;
+ struct dma_win *window;
+ const struct dynamic_dma_window_prop *dma64;
+ bool found = false;
- spin_lock(&direct_window_list_lock);
+ spin_lock(&dma_win_list_lock);
/* check if we already created a window and dupe that config if so */
- list_for_each_entry(window, &direct_window_list, list) {
+ list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
- direct64 = window->prop;
- dma_addr = be64_to_cpu(direct64->dma_base);
- *window_shift = be32_to_cpu(direct64->window_shift);
+ dma64 = window->prop;
+ *dma_addr = be64_to_cpu(dma64->dma_base);
+ *window_shift = be32_to_cpu(dma64->window_shift);
+ found = true;
break;
}
}
- spin_unlock(&direct_window_list_lock);
+ spin_unlock(&dma_win_list_lock);
- return dma_addr;
+ return found;
}
-static int find_existing_ddw_windows(void)
+static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
+ const struct dynamic_dma_window_prop *dma64)
{
- int len;
- struct device_node *pdn;
- struct direct_window *window;
- const struct dynamic_dma_window_prop *direct64;
+ struct dma_win *window;
- if (!firmware_has_feature(FW_FEATURE_LPAR))
- return 0;
+ window = kzalloc(sizeof(*window), GFP_KERNEL);
+ if (!window)
+ return NULL;
- for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
- direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
- if (!direct64)
- continue;
+ window->device = pdn;
+ window->prop = dma64;
- window = kzalloc(sizeof(*window), GFP_KERNEL);
- if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
- kfree(window);
- remove_ddw(pdn, true);
+ return window;
+}
+
+static void find_existing_ddw_windows_named(const char *name)
+{
+ int len;
+ struct device_node *pdn;
+ struct dma_win *window;
+ const struct dynamic_dma_window_prop *dma64;
+
+ for_each_node_with_property(pdn, name) {
+ dma64 = of_get_property(pdn, name, &len);
+ if (!dma64 || len < sizeof(*dma64)) {
+ remove_ddw(pdn, true, name);
continue;
}
- window->device = pdn;
- window->prop = direct64;
- spin_lock(&direct_window_list_lock);
- list_add(&window->list, &direct_window_list);
- spin_unlock(&direct_window_list_lock);
+ window = ddw_list_new_entry(pdn, dma64);
+ if (!window)
+ break;
+
+ spin_lock(&dma_win_list_lock);
+ list_add(&window->list, &dma_win_list);
+ spin_unlock(&dma_win_list_lock);
}
+}
+
+static int find_existing_ddw_windows(void)
+{
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ return 0;
+
+ find_existing_ddw_windows_named(DIRECT64_PROPNAME);
+ find_existing_ddw_windows_named(DMA64_PROPNAME);
return 0;
}
@@ -1130,6 +1183,35 @@ static int iommu_get_page_shift(u32 query_page_size)
return 0;
}
+static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
+ u32 page_shift, u32 window_shift)
+{
+ struct dynamic_dma_window_prop *ddwprop;
+ struct property *win64;
+
+ win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
+ if (!win64)
+ return NULL;
+
+ win64->name = kstrdup(propname, GFP_KERNEL);
+ ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
+ win64->value = ddwprop;
+ win64->length = sizeof(*ddwprop);
+ if (!win64->name || !win64->value) {
+ kfree(win64->name);
+ kfree(win64->value);
+ kfree(win64);
+ return NULL;
+ }
+
+ ddwprop->liobn = cpu_to_be32(liobn);
+ ddwprop->dma_base = cpu_to_be64(dma_addr);
+ ddwprop->tce_shift = cpu_to_be32(page_shift);
+ ddwprop->window_shift = cpu_to_be32(window_shift);
+
+ return win64;
+}
+
/*
* If the PE supports dynamic dma windows, and there is space for a table
* that can map all pages in a linear offset, then setup such a table,
@@ -1139,34 +1221,39 @@ static int iommu_get_page_shift(u32 query_page_size)
* pdn: the parent pe node with the ibm,dma_window property
* Future: also check if we can remap the base window for our base page size
*
- * returns the dma offset for use by the direct mapped DMA code.
+ * returns true if it can map all pages (direct mapping), false otherwise.
*/
-static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
{
int len = 0, ret;
int max_ram_len = order_base_2(ddw_memory_hotplug_max());
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
- u64 dma_addr;
+ u64 win_addr;
+ const char *win_name;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
- struct direct_window *window;
+ struct dma_win *window;
struct property *win64;
- struct dynamic_dma_window_prop *ddwprop;
+ bool ddw_enabled = false;
struct failed_ddw_pdn *fpdn;
- bool default_win_removed = false;
+ bool default_win_removed = false, direct_mapping = false;
bool pmem_present;
+ struct pci_dn *pci = PCI_DN(pdn);
+ struct iommu_table *tbl = pci->table_group->tables[0];
dn = of_find_node_by_type(NULL, "ibm,pmemory");
pmem_present = dn != NULL;
of_node_put(dn);
- mutex_lock(&direct_window_init_mutex);
+ mutex_lock(&dma_win_init_mutex);
- dma_addr = find_existing_ddw(pdn, &len);
- if (dma_addr != 0)
+ if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
+ direct_mapping = (len >= max_ram_len);
+ ddw_enabled = true;
goto out_unlock;
+ }
/*
* If we already went through this for a previous function of
@@ -1240,12 +1327,12 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
page_shift = iommu_get_page_shift(query.page_size);
if (!page_shift) {
- dev_dbg(&dev->dev, "no supported direct page size in mask %x",
- query.page_size);
+ dev_dbg(&dev->dev, "no supported page size in mask %x",
+ query.page_size);
goto out_failed;
}
- /* verify the window * number of ptes will map the partition */
- /* check largest block * page size > max memory hotplug addr */
+
+
/*
* The "ibm,pmemory" can appear anywhere in the address space.
* Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
@@ -1261,81 +1348,127 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
dev_info(&dev->dev, "Skipping ibm,pmemory");
}
+ /* check if the available block * number of ptes will map everything */
if (query.largest_available_block < (1ULL << (len - page_shift))) {
dev_dbg(&dev->dev,
"can't map partition max 0x%llx with %llu %llu-sized pages\n",
1ULL << len,
query.largest_available_block,
1ULL << page_shift);
- goto out_failed;
- }
- win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
- if (!win64) {
- dev_info(&dev->dev,
- "couldn't allocate property for 64bit dma window\n");
- goto out_failed;
- }
- win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
- win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
- win64->length = sizeof(*ddwprop);
- if (!win64->name || !win64->value) {
- dev_info(&dev->dev,
- "couldn't allocate property name and value\n");
- goto out_free_prop;
+
+ /* DDW + IOMMU on single window may fail if there is any allocation */
+ if (default_win_removed && iommu_table_in_use(tbl)) {
+ dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
+ goto out_failed;
+ }
+
+ len = order_base_2(query.largest_available_block << page_shift);
+ win_name = DMA64_PROPNAME;
+ } else {
+ direct_mapping = true;
+ win_name = DIRECT64_PROPNAME;
}
ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
if (ret != 0)
- goto out_free_prop;
-
- ddwprop->liobn = cpu_to_be32(create.liobn);
- ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) |
- create.addr_lo);
- ddwprop->tce_shift = cpu_to_be32(page_shift);
- ddwprop->window_shift = cpu_to_be32(len);
+ goto out_failed;
dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
create.liobn, dn);
- window = kzalloc(sizeof(*window), GFP_KERNEL);
- if (!window)
- goto out_clear_window;
+ win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
+ win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);
- ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
- win64->value, tce_setrange_multi_pSeriesLP_walk);
- if (ret) {
- dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
- dn, ret);
- goto out_free_window;
+ if (!win64) {
+ dev_info(&dev->dev,
+ "couldn't allocate property, property name, or value\n");
+ goto out_remove_win;
}
ret = of_add_property(pdn, win64);
if (ret) {
- dev_err(&dev->dev, "unable to add dma window property for %pOF: %d",
- pdn, ret);
- goto out_free_window;
+ dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",
+ pdn, ret);
+ goto out_free_prop;
}
- window->device = pdn;
- window->prop = ddwprop;
- spin_lock(&direct_window_list_lock);
- list_add(&window->list, &direct_window_list);
- spin_unlock(&direct_window_list_lock);
+ window = ddw_list_new_entry(pdn, win64->value);
+ if (!window)
+ goto out_del_prop;
+
+ if (direct_mapping) {
+ /* DDW maps the whole partition, so enable direct DMA mapping */
+ ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
+ win64->value, tce_setrange_multi_pSeriesLP_walk);
+ if (ret) {
+ dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
+ dn, ret);
+
+ /* Make sure to clean DDW if any TCE was set */
+ clean_dma_window(pdn, win64->value);
+ goto out_del_list;
+ }
+ } else {
+ struct iommu_table *newtbl;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
+ const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;
+
+ /* Look for MMIO32 */
+ if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM)
+ break;
+ }
+
+ if (i == ARRAY_SIZE(pci->phb->mem_resources))
+ goto out_del_list;
+
+ /* New table for using DDW instead of the default DMA window */
+ newtbl = iommu_pseries_alloc_table(pci->phb->node);
+ if (!newtbl) {
+ dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
+ goto out_del_list;
+ }
+
+ iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
+ 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
+ iommu_init_table(newtbl, pci->phb->node, pci->phb->mem_resources[i].start,
+ pci->phb->mem_resources[i].end);
- dma_addr = be64_to_cpu(ddwprop->dma_base);
+ pci->table_group->tables[1] = newtbl;
+
+ /* Keep default DMA window struct if removed */
+ if (default_win_removed) {
+ tbl->it_size = 0;
+ kfree(tbl->it_map);
+ }
+
+ set_iommu_table_base(&dev->dev, newtbl);
+ }
+
+ spin_lock(&dma_win_list_lock);
+ list_add(&window->list, &dma_win_list);
+ spin_unlock(&dma_win_list_lock);
+
+ dev->dev.archdata.dma_offset = win_addr;
+ ddw_enabled = true;
goto out_unlock;
-out_free_window:
+out_del_list:
kfree(window);
-out_clear_window:
- remove_ddw(pdn, true);
+out_del_prop:
+ of_remove_property(pdn, win64);
out_free_prop:
kfree(win64->name);
kfree(win64->value);
kfree(win64);
+out_remove_win:
+ /* DDW is clean, so it's ok to call this directly. */
+ __remove_dma_window(pdn, ddw_avail, create.liobn);
+
out_failed:
if (default_win_removed)
reset_dma_window(dev, pdn);
@@ -1347,17 +1480,17 @@ out_failed:
list_add(&fpdn->list, &failed_ddw_pdn_list);
out_unlock:
- mutex_unlock(&direct_window_init_mutex);
+ mutex_unlock(&dma_win_init_mutex);
/*
* If we have persistent memory and the window size is only as big
* as RAM, then we failed to create a window to cover persistent
* memory and need to set the DMA limit.
*/
- if (pmem_present && dma_addr && (len == max_ram_len))
- dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
+ if (pmem_present && ddw_enabled && direct_mapping && len == max_ram_len)
+ dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len);
- return dma_addr;
+ return ddw_enabled && direct_mapping;
}
static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
@@ -1399,7 +1532,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
tbl = pci->table_group->tables[0];
iommu_table_setparms_lpar(pci->phb, pdn, tbl,
pci->table_group, dma_window);
- tbl->it_ops = &iommu_table_lpar_multi_ops;
+
iommu_init_table(tbl, pci->phb->node, 0, 0);
iommu_register_group(pci->table_group,
pci_domain_nr(pci->phb->bus), 0);
@@ -1436,11 +1569,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
break;
}
- if (pdn && PCI_DN(pdn)) {
- pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
- if (pdev->dev.archdata.dma_offset)
- return true;
- }
+ if (pdn && PCI_DN(pdn))
+ return enable_ddw(pdev, pdn);
return false;
}
@@ -1448,29 +1578,29 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
void *data)
{
- struct direct_window *window;
+ struct dma_win *window;
struct memory_notify *arg = data;
int ret = 0;
switch (action) {
case MEM_GOING_ONLINE:
- spin_lock(&direct_window_list_lock);
- list_for_each_entry(window, &direct_window_list, list) {
+ spin_lock(&dma_win_list_lock);
+ list_for_each_entry(window, &dma_win_list, list) {
ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
arg->nr_pages, window->prop);
/* XXX log error */
}
- spin_unlock(&direct_window_list_lock);
+ spin_unlock(&dma_win_list_lock);
break;
case MEM_CANCEL_ONLINE:
case MEM_OFFLINE:
- spin_lock(&direct_window_list_lock);
- list_for_each_entry(window, &direct_window_list, list) {
+ spin_lock(&dma_win_list_lock);
+ list_for_each_entry(window, &dma_win_list, list) {
ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
arg->nr_pages, window->prop);
/* XXX log error */
}
- spin_unlock(&direct_window_list_lock);
+ spin_unlock(&dma_win_list_lock);
break;
default:
break;
@@ -1491,7 +1621,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
struct of_reconfig_data *rd = data;
struct device_node *np = rd->dn;
struct pci_dn *pci = PCI_DN(np);
- struct direct_window *window;
+ struct dma_win *window;
switch (action) {
case OF_RECONFIG_DETACH_NODE:
@@ -1502,20 +1632,22 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
* we have to remove the property when releasing
* the device node.
*/
- remove_ddw(np, false);
+ if (remove_ddw(np, false, DIRECT64_PROPNAME))
+ remove_ddw(np, false, DMA64_PROPNAME);
+
if (pci && pci->table_group)
iommu_pseries_free_group(pci->table_group,
np->full_name);
- spin_lock(&direct_window_list_lock);
- list_for_each_entry(window, &direct_window_list, list) {
+ spin_lock(&dma_win_list_lock);
+ list_for_each_entry(window, &dma_win_list, list) {
if (window->device == np) {
list_del(&window->list);
kfree(window);
break;
}
}
- spin_unlock(&direct_window_list_lock);
+ spin_unlock(&dma_win_list_lock);
break;
default:
err = NOTIFY_DONE;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index dab356e3ff87..3df6bdfea475 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -22,6 +22,8 @@
#include <linux/workqueue.h>
#include <linux/proc_fs.h>
#include <linux/pgtable.h>
+#include <linux/debugfs.h>
+
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/page.h>
@@ -39,7 +41,6 @@
#include <asm/kexec.h>
#include <asm/fadump.h>
#include <asm/asm-prototypes.h>
-#include <asm/debugfs.h>
#include <asm/dtl.h>
#include "pseries.h"
@@ -261,7 +262,7 @@ static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu)
if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
return -EIO;
- return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
+ return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
}
static int cpu_home_node_dispatch_distance(int disp_cpu)
@@ -281,7 +282,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu)
if (!disp_cpu_assoc || !vcpu_assoc)
return -EIO;
- return cpu_distance(disp_cpu_assoc, vcpu_assoc);
+ return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc);
}
static void update_vcpu_disp_stat(int disp_cpu)
@@ -801,7 +802,8 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
return -1;
}
-static void manual_hpte_clear_all(void)
+/* Called during kexec sequence with MMU off */
+static notrace void manual_hpte_clear_all(void)
{
unsigned long size_bytes = 1UL << ppc64_pft_size;
unsigned long hpte_count = size_bytes >> 4;
@@ -834,7 +836,8 @@ static void manual_hpte_clear_all(void)
}
}
-static int hcall_hpte_clear_all(void)
+/* Called during kexec sequence with MMU off */
+static notrace int hcall_hpte_clear_all(void)
{
int rc;
@@ -845,7 +848,8 @@ static int hcall_hpte_clear_all(void)
return rc;
}
-static void pseries_hpte_clear_all(void)
+/* Called during kexec sequence with MMU off */
+static notrace void pseries_hpte_clear_all(void)
{
int rc;
@@ -2016,7 +2020,7 @@ static int __init vpa_debugfs_init(void)
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
return 0;
- vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root);
+ vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir);
/* set up the per-cpu vpa file*/
for_each_possible_cpu(i) {
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 637300330507..1b305e411862 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -13,6 +13,7 @@
#include <asm/hw_irq.h>
#include <asm/ppc-pci.h>
#include <asm/machdep.h>
+#include <asm/xive.h>
#include "pseries.h"
@@ -110,21 +111,6 @@ static int rtas_query_irq_number(struct pci_dn *pdn, int offset)
return rtas_ret[0];
}
-static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
-{
- struct msi_desc *entry;
-
- for_each_pci_msi_entry(entry, pdev) {
- if (!entry->irq)
- continue;
-
- irq_set_msi_desc(entry->irq, NULL);
- irq_dispose_mapping(entry->irq);
- }
-
- rtas_disable_msi(pdev);
-}
-
static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
{
struct device_node *dn;
@@ -164,12 +150,12 @@ static int check_req_msix(struct pci_dev *pdev, int nvec)
/* Quota calculation */
-static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+static struct device_node *__find_pe_total_msi(struct device_node *node, int *total)
{
struct device_node *dn;
const __be32 *p;
- dn = of_node_get(pci_device_to_OF_node(dev));
+ dn = of_node_get(node);
while (dn) {
p = of_get_property(dn, "ibm,pe-total-#msi", NULL);
if (p) {
@@ -185,6 +171,11 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
return NULL;
}
+static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+{
+ return __find_pe_total_msi(pci_device_to_OF_node(dev), total);
+}
+
static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
{
struct device_node *dn;
@@ -368,12 +359,11 @@ static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev)
pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0);
}
-static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
+static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type,
+ msi_alloc_info_t *arg)
{
struct pci_dn *pdn;
- int hwirq, virq, i, quota, rc;
- struct msi_desc *entry;
- struct msi_msg msg;
+ int quota, rc;
int nvec = nvec_in;
int use_32bit_msi_hack = 0;
@@ -451,53 +441,229 @@ again:
return rc;
}
- i = 0;
- for_each_pci_msi_entry(entry, pdev) {
- hwirq = rtas_query_irq_number(pdn, i++);
- if (hwirq < 0) {
- pr_debug("rtas_msi: error (%d) getting hwirq\n", rc);
- return hwirq;
- }
+ return 0;
+}
- /*
- * Depending on the number of online CPUs in the original
- * kernel, it is likely for CPU #0 to be offline in a kdump
- * kernel. The associated IRQs in the affinity mappings
- * provided by irq_create_affinity_masks() are thus not
- * started by irq_startup(), as per-design for managed IRQs.
- * This can be a problem with multi-queue block devices driven
- * by blk-mq : such a non-started IRQ is very likely paired
- * with the single queue enforced by blk-mq during kdump (see
- * blk_mq_alloc_tag_set()). This causes the device to remain
- * silent and likely hangs the guest at some point.
- *
- * We don't really care for fine-grained affinity when doing
- * kdump actually : simply ignore the pre-computed affinity
- * masks in this case and let the default mask with all CPUs
- * be used when creating the IRQ mappings.
- */
- if (is_kdump_kernel())
- virq = irq_create_mapping(NULL, hwirq);
- else
- virq = irq_create_mapping_affinity(NULL, hwirq,
- entry->affinity);
+static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct msi_desc *desc = first_pci_msi_entry(pdev);
+ int type = desc->msi_attrib.is_msix ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
- if (!virq) {
- pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
- return -ENOSPC;
- }
+ return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+}
+
+/*
+ * ->msi_free() is called before irq_domain_free_irqs_top() when the
+ * handler data is still available. Use that to clear the XIVE
+ * controller data.
+ */
+static void pseries_msi_ops_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ unsigned int irq)
+{
+ if (xive_enabled())
+ xive_irq_free_data(irq);
+}
+
+/*
+ * RTAS cannot disable one MSI at a time. It's all or nothing. Do it
+ * at the end after all IRQs have been freed.
+ */
+static void pseries_msi_domain_free_irqs(struct irq_domain *domain,
+ struct device *dev)
+{
+ if (WARN_ON_ONCE(!dev_is_pci(dev)))
+ return;
+
+ __msi_domain_free_irqs(domain, dev);
+
+ rtas_disable_msi(to_pci_dev(dev));
+}
+
+static struct msi_domain_ops pseries_pci_msi_domain_ops = {
+ .msi_prepare = pseries_msi_ops_prepare,
+ .msi_free = pseries_msi_ops_msi_free,
+ .domain_free_irqs = pseries_msi_domain_free_irqs,
+};
+
+static void pseries_msi_shutdown(struct irq_data *d)
+{
+ d = d->parent_data;
+ if (d->chip->irq_shutdown)
+ d->chip->irq_shutdown(d);
+}
+
+static void pseries_msi_mask(struct irq_data *d)
+{
+ pci_msi_mask_irq(d);
+ irq_chip_mask_parent(d);
+}
+
+static void pseries_msi_unmask(struct irq_data *d)
+{
+ pci_msi_unmask_irq(d);
+ irq_chip_unmask_parent(d);
+}
- dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq);
- irq_set_msi_desc(virq, entry);
+static struct irq_chip pseries_pci_msi_irq_chip = {
+ .name = "pSeries-PCI-MSI",
+ .irq_shutdown = pseries_msi_shutdown,
+ .irq_mask = pseries_msi_mask,
+ .irq_unmask = pseries_msi_unmask,
+ .irq_eoi = irq_chip_eoi_parent,
+};
+
+static struct msi_domain_info pseries_msi_domain_info = {
+ .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX),
+ .ops = &pseries_pci_msi_domain_ops,
+ .chip = &pseries_pci_msi_irq_chip,
+};
+
+static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __pci_read_msi_msg(irq_data_get_msi_desc(data), msg);
+}
+
+static struct irq_chip pseries_msi_irq_chip = {
+ .name = "pSeries-MSI",
+ .irq_shutdown = pseries_msi_shutdown,
+ .irq_mask = irq_chip_mask_parent,
+ .irq_unmask = irq_chip_unmask_parent,
+ .irq_eoi = irq_chip_eoi_parent,
+ .irq_set_affinity = irq_chip_set_affinity_parent,
+ .irq_compose_msi_msg = pseries_msi_compose_msg,
+};
+
+static int pseries_irq_parent_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ struct irq_fwspec parent_fwspec;
+ int ret;
+
+ parent_fwspec.fwnode = domain->parent->fwnode;
+ parent_fwspec.param_count = 2;
+ parent_fwspec.param[0] = hwirq;
+ parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct pci_controller *phb = domain->host_data;
+ msi_alloc_info_t *info = arg;
+ struct msi_desc *desc = info->desc;
+ struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
+ int hwirq;
+ int i, ret;
+
+ hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_attrib.entry_nr);
+ if (hwirq < 0) {
+ dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
+ return hwirq;
+ }
+
+ dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__,
+ phb->dn, virq, hwirq, nr_irqs);
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = pseries_irq_parent_domain_alloc(domain, virq + i, hwirq + i);
+ if (ret)
+ goto out;
+
+ irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+ &pseries_msi_irq_chip, domain->host_data);
+ }
+
+ return 0;
+
+out:
+ /* TODO: handle RTAS cleanup in ->msi_finish()? */
+ irq_domain_free_irqs_parent(domain, virq, i - 1);
+ return ret;
+}
+
+static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+ struct pci_controller *phb = irq_data_get_irq_chip_data(d);
+
+ pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
+
+ /* XIVE domain data is cleared through ->msi_free() */
+}
- /* Read config space back so we can restore after reset */
- __pci_read_msi_msg(entry, &msg);
- entry->msg = msg;
+static const struct irq_domain_ops pseries_irq_domain_ops = {
+ .alloc = pseries_irq_domain_alloc,
+ .free = pseries_irq_domain_free,
+};
+
+static int __pseries_msi_allocate_domains(struct pci_controller *phb,
+ unsigned int count)
+{
+ struct irq_domain *parent = irq_get_default_host();
+
+ phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI",
+ phb->global_number);
+ if (!phb->fwnode)
+ return -ENOMEM;
+
+ phb->dev_domain = irq_domain_create_hierarchy(parent, 0, count,
+ phb->fwnode,
+ &pseries_irq_domain_ops, phb);
+ if (!phb->dev_domain) {
+ pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n",
+ phb->dn, phb->global_number);
+ irq_domain_free_fwnode(phb->fwnode);
+ return -ENOMEM;
+ }
+
+ phb->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(phb->dn),
+ &pseries_msi_domain_info,
+ phb->dev_domain);
+ if (!phb->msi_domain) {
+ pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n",
+ phb->dn, phb->global_number);
+ irq_domain_free_fwnode(phb->fwnode);
+ irq_domain_remove(phb->dev_domain);
+ return -ENOMEM;
}
return 0;
}
+int pseries_msi_allocate_domains(struct pci_controller *phb)
+{
+ int count;
+
+ if (!__find_pe_total_msi(phb->dn, &count)) {
+ pr_err("PCI: failed to find MSIs for bridge %pOF (domain %d)\n",
+ phb->dn, phb->global_number);
+ return -ENOSPC;
+ }
+
+ return __pseries_msi_allocate_domains(phb, count);
+}
+
+void pseries_msi_free_domains(struct pci_controller *phb)
+{
+ if (phb->msi_domain)
+ irq_domain_remove(phb->msi_domain);
+ if (phb->dev_domain)
+ irq_domain_remove(phb->dev_domain);
+ if (phb->fwnode)
+ irq_domain_free_fwnode(phb->fwnode);
+}
+
static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
{
/* No LSI -> leave MSIs (if any) configured */
@@ -518,8 +684,6 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
static int rtas_msi_init(void)
{
- struct pci_controller *phb;
-
query_token = rtas_token("ibm,query-interrupt-source-number");
change_token = rtas_token("ibm,change-msi");
@@ -531,16 +695,6 @@ static int rtas_msi_init(void)
pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");
- WARN_ON(pseries_pci_controller_ops.setup_msi_irqs);
- pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
- pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
-
- list_for_each_entry(phb, &hose_list, list_node) {
- WARN_ON(phb->controller_ops.setup_msi_irqs);
- phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
- phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
- }
-
WARN_ON(ppc_md.pci_irq_fixup);
ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index a8f9140a24fa..90c9d3531694 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -33,6 +33,8 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn)
pci_devs_phb_init_dynamic(phb);
+ pseries_msi_allocate_domains(phb);
+
/* Create EEH devices for the PHB */
eeh_phb_pe_create(phb);
@@ -74,6 +76,8 @@ int remove_phb_dynamic(struct pci_controller *phb)
}
}
+ pseries_msi_free_domains(phb);
+
/* Remove the PCI bus and unregister the bridge device from sysfs */
phb->bus = NULL;
pci_remove_bus(b);
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 1f051a786fb3..3544778e06d0 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -85,6 +85,8 @@ struct pci_host_bridge;
int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);
extern struct pci_controller_ops pseries_pci_controller_ops;
+int pseries_msi_allocate_domains(struct pci_controller *phb);
+void pseries_msi_free_domains(struct pci_controller *phb);
unsigned long pseries_memory_block_size(void);
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 167f2e1b8d39..56092dccfdb8 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -783,7 +783,7 @@ static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt)
{
int recovered = 0;
- if (!(regs->msr & MSR_RI)) {
+ if (regs_is_unrecoverable(regs)) {
/* If MSR_RI isn't set, we cannot recover */
pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
recovered = 0;
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 0dfaa6ab44cc..f79126f16258 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -486,6 +486,8 @@ static void __init pSeries_discover_phbs(void)
/* create pci_dn's for DT nodes under this PHB */
pci_devs_phb_init_dynamic(phb);
+
+ pseries_msi_allocate_domains(phb);
}
of_node_put(root);
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index 1d829e257996..87f001b4c4e4 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -63,6 +63,9 @@ void __init svm_swiotlb_init(void)
int set_memory_encrypted(unsigned long addr, int numpages)
{
+ if (!mem_encrypt_active())
+ return 0;
+
if (!PAGE_ALIGNED(addr))
return -EINVAL;
@@ -73,6 +76,9 @@ int set_memory_encrypted(unsigned long addr, int numpages)
int set_memory_decrypted(unsigned long addr, int numpages)
{
+ if (!mem_encrypt_active())
+ return 0;
+
if (!PAGE_ALIGNED(addr))
return -EINVAL;
diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c
index b5c1cf1bc64d..b043e3936d21 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -184,7 +184,7 @@ static int h_get_nx_fault(u32 winid, u64 buffer)
* Note: The hypervisor forwards an interrupt for each fault request.
* So one fault CRB to process for each H_GET_NX_FAULT hcall.
*/
-irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data)
+static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data)
{
struct pseries_vas_window *txwin = data;
struct coprocessor_request_block crb;
diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c
index 5a95b8ea23d8..ff7906b48ca1 100644
--- a/arch/powerpc/sysdev/fsl_rio.c
+++ b/arch/powerpc/sysdev/fsl_rio.c
@@ -108,7 +108,7 @@ int fsl_rio_mcheck_exception(struct pt_regs *regs)
__func__);
out_be32((u32 *)(rio_regs_win + RIO_LTLEDCSR),
0);
- regs_set_return_msr(regs, regs->msr | MSR_RI);
+ regs_set_recoverable(regs);
regs_set_return_ip(regs, extable_fixup(entry));
return 1;
}
diff --git a/arch/powerpc/sysdev/xics/ics-native.c b/arch/powerpc/sysdev/xics/ics-native.c
index d450502f4053..dec7d93a8ba1 100644
--- a/arch/powerpc/sysdev/xics/ics-native.c
+++ b/arch/powerpc/sysdev/xics/ics-native.c
@@ -131,19 +131,15 @@ static struct irq_chip ics_native_irq_chip = {
.irq_retrigger = xics_retrigger,
};
-static int ics_native_map(struct ics *ics, unsigned int virq)
+static int ics_native_check(struct ics *ics, unsigned int hw_irq)
{
- unsigned int vec = (unsigned int)virq_to_hw(virq);
struct ics_native *in = to_ics_native(ics);
- pr_devel("%s: vec=0x%x\n", __func__, vec);
+ pr_devel("%s: hw_irq=0x%x\n", __func__, hw_irq);
- if (vec < in->ibase || vec >= (in->ibase + in->icount))
+ if (hw_irq < in->ibase || hw_irq >= (in->ibase + in->icount))
return -EINVAL;
- irq_set_chip_and_handler(virq, &ics_native_irq_chip, handle_fasteoi_irq);
- irq_set_chip_data(virq, ics);
-
return 0;
}
@@ -177,10 +173,11 @@ static int ics_native_host_match(struct ics *ics, struct device_node *node)
}
static struct ics ics_native_template = {
- .map = ics_native_map,
+ .check = ics_native_check,
.mask_unknown = ics_native_mask_unknown,
.get_server = ics_native_get_server,
.host_match = ics_native_host_match,
+ .chip = &ics_native_irq_chip,
};
static int __init ics_native_add_one(struct device_node *np)
diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c
index 823f6c9664cd..c4d95d8beb6f 100644
--- a/arch/powerpc/sysdev/xics/ics-opal.c
+++ b/arch/powerpc/sysdev/xics/ics-opal.c
@@ -62,17 +62,6 @@ static void ics_opal_unmask_irq(struct irq_data *d)
static unsigned int ics_opal_startup(struct irq_data *d)
{
-#ifdef CONFIG_PCI_MSI
- /*
- * The generic MSI code returns with the interrupt disabled on the
- * card, using the MSI mask bits. Firmware doesn't appear to unmask
- * at that level, so we do it here by hand.
- */
- if (irq_data_get_msi_desc(d))
- pci_msi_unmask_irq(d);
-#endif
-
- /* unmask it */
ics_opal_unmask_irq(d);
return 0;
}
@@ -133,7 +122,7 @@ static int ics_opal_set_affinity(struct irq_data *d,
}
server = ics_opal_mangle_server(wanted_server);
- pr_devel("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n",
+ pr_debug("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n",
d->irq, hw_irq, wanted_server, server);
rc = opal_set_xive(hw_irq, server, priority);
@@ -157,26 +146,13 @@ static struct irq_chip ics_opal_irq_chip = {
.irq_retrigger = xics_retrigger,
};
-static int ics_opal_map(struct ics *ics, unsigned int virq);
-static void ics_opal_mask_unknown(struct ics *ics, unsigned long vec);
-static long ics_opal_get_server(struct ics *ics, unsigned long vec);
-
static int ics_opal_host_match(struct ics *ics, struct device_node *node)
{
return 1;
}
-/* Only one global & state struct ics */
-static struct ics ics_hal = {
- .map = ics_opal_map,
- .mask_unknown = ics_opal_mask_unknown,
- .get_server = ics_opal_get_server,
- .host_match = ics_opal_host_match,
-};
-
-static int ics_opal_map(struct ics *ics, unsigned int virq)
+static int ics_opal_check(struct ics *ics, unsigned int hw_irq)
{
- unsigned int hw_irq = (unsigned int)virq_to_hw(virq);
int64_t rc;
__be16 server;
int8_t priority;
@@ -189,9 +165,6 @@ static int ics_opal_map(struct ics *ics, unsigned int virq)
if (rc != OPAL_SUCCESS)
return -ENXIO;
- irq_set_chip_and_handler(virq, &ics_opal_irq_chip, handle_fasteoi_irq);
- irq_set_chip_data(virq, &ics_hal);
-
return 0;
}
@@ -222,6 +195,15 @@ static long ics_opal_get_server(struct ics *ics, unsigned long vec)
return ics_opal_unmangle_server(be16_to_cpu(server));
}
+/* Only one global & state struct ics */
+static struct ics ics_hal = {
+ .check = ics_opal_check,
+ .mask_unknown = ics_opal_mask_unknown,
+ .get_server = ics_opal_get_server,
+ .host_match = ics_opal_host_match,
+ .chip = &ics_opal_irq_chip,
+};
+
int __init ics_opal_init(void)
{
if (!firmware_has_feature(FW_FEATURE_OPAL))
diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c
index 4cf18000f07c..b9da317b7a2d 100644
--- a/arch/powerpc/sysdev/xics/ics-rtas.c
+++ b/arch/powerpc/sysdev/xics/ics-rtas.c
@@ -24,19 +24,6 @@ static int ibm_set_xive;
static int ibm_int_on;
static int ibm_int_off;
-static int ics_rtas_map(struct ics *ics, unsigned int virq);
-static void ics_rtas_mask_unknown(struct ics *ics, unsigned long vec);
-static long ics_rtas_get_server(struct ics *ics, unsigned long vec);
-static int ics_rtas_host_match(struct ics *ics, struct device_node *node);
-
-/* Only one global & state struct ics */
-static struct ics ics_rtas = {
- .map = ics_rtas_map,
- .mask_unknown = ics_rtas_mask_unknown,
- .get_server = ics_rtas_get_server,
- .host_match = ics_rtas_host_match,
-};
-
static void ics_rtas_unmask_irq(struct irq_data *d)
{
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
@@ -70,15 +57,6 @@ static void ics_rtas_unmask_irq(struct irq_data *d)
static unsigned int ics_rtas_startup(struct irq_data *d)
{
-#ifdef CONFIG_PCI_MSI
- /*
- * The generic MSI code returns with the interrupt disabled on the
- * card, using the MSI mask bits. Firmware doesn't appear to unmask
- * at that level, so we do it here by hand.
- */
- if (irq_data_get_msi_desc(d))
- pci_msi_unmask_irq(d);
-#endif
/* unmask it */
ics_rtas_unmask_irq(d);
return 0;
@@ -146,6 +124,9 @@ static int ics_rtas_set_affinity(struct irq_data *d,
return -1;
}
+ pr_debug("%s: irq %d [hw 0x%x] server: 0x%x\n", __func__, d->irq,
+ hw_irq, irq_server);
+
status = rtas_call_reentrant(ibm_set_xive, 3, 1, NULL,
hw_irq, irq_server, xics_status[1]);
@@ -169,9 +150,8 @@ static struct irq_chip ics_rtas_irq_chip = {
.irq_retrigger = xics_retrigger,
};
-static int ics_rtas_map(struct ics *ics, unsigned int virq)
+static int ics_rtas_check(struct ics *ics, unsigned int hw_irq)
{
- unsigned int hw_irq = (unsigned int)virq_to_hw(virq);
int status[2];
int rc;
@@ -183,9 +163,6 @@ static int ics_rtas_map(struct ics *ics, unsigned int virq)
if (rc)
return -ENXIO;
- irq_set_chip_and_handler(virq, &ics_rtas_irq_chip, handle_fasteoi_irq);
- irq_set_chip_data(virq, &ics_rtas);
-
return 0;
}
@@ -213,6 +190,15 @@ static int ics_rtas_host_match(struct ics *ics, struct device_node *node)
return !of_device_is_compatible(node, "chrp,iic");
}
+/* Only one global & state struct ics */
+static struct ics ics_rtas = {
+ .check = ics_rtas_check,
+ .mask_unknown = ics_rtas_mask_unknown,
+ .get_server = ics_rtas_get_server,
+ .host_match = ics_rtas_host_match,
+ .chip = &ics_rtas_irq_chip,
+};
+
__init int ics_rtas_init(void)
{
ibm_get_xive = rtas_token("ibm,get-xive");
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index b14c502e56a8..5c1a157a83b8 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -38,7 +38,7 @@ DEFINE_PER_CPU(struct xics_cppr, xics_cppr);
struct irq_domain *xics_host;
-static LIST_HEAD(ics_list);
+static struct ics *xics_ics;
void xics_update_irq_servers(void)
{
@@ -111,12 +111,11 @@ void xics_setup_cpu(void)
void xics_mask_unknown_vec(unsigned int vec)
{
- struct ics *ics;
-
pr_err("Interrupt 0x%x (real) is invalid, disabling it.\n", vec);
- list_for_each_entry(ics, &ics_list, link)
- ics->mask_unknown(ics, vec);
+ if (WARN_ON(!xics_ics))
+ return;
+ xics_ics->mask_unknown(xics_ics, vec);
}
@@ -133,7 +132,7 @@ static void xics_request_ipi(void)
* IPIs are marked IRQF_PERCPU. The handler was set in map.
*/
BUG_ON(request_irq(ipi, icp_ops->ipi_action,
- IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
+ IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
}
void __init xics_smp_probe(void)
@@ -184,6 +183,8 @@ void xics_migrate_irqs_away(void)
unsigned int irq, virq;
struct irq_desc *desc;
+ pr_debug("%s: CPU %u\n", __func__, cpu);
+
/* If we used to be the default server, move to the new "boot_cpuid" */
if (hw_cpu == xics_default_server)
xics_update_irq_servers();
@@ -198,7 +199,7 @@ void xics_migrate_irqs_away(void)
struct irq_chip *chip;
long server;
unsigned long flags;
- struct ics *ics;
+ struct irq_data *irqd;
/* We can't set affinity on ISA interrupts */
if (virq < NR_IRQS_LEGACY)
@@ -206,9 +207,11 @@ void xics_migrate_irqs_away(void)
/* We only need to migrate enabled IRQS */
if (!desc->action)
continue;
- if (desc->irq_data.domain != xics_host)
+ /* We need a mapping in the XICS IRQ domain */
+ irqd = irq_domain_get_irq_data(xics_host, virq);
+ if (!irqd)
continue;
- irq = desc->irq_data.hwirq;
+ irq = irqd_to_hwirq(irqd);
/* We need to get IPIs still. */
if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
continue;
@@ -219,13 +222,10 @@ void xics_migrate_irqs_away(void)
raw_spin_lock_irqsave(&desc->lock, flags);
/* Locate interrupt server */
- server = -1;
- ics = irq_desc_get_chip_data(desc);
- if (ics)
- server = ics->get_server(ics, irq);
+ server = xics_ics->get_server(xics_ics, irq);
if (server < 0) {
- printk(KERN_ERR "%s: Can't find server for irq %d\n",
- __func__, irq);
+ pr_err("%s: Can't find server for irq %d/%x\n",
+ __func__, virq, irq);
goto unlock;
}
@@ -307,13 +307,9 @@ int xics_get_irq_server(unsigned int virq, const struct cpumask *cpumask,
static int xics_host_match(struct irq_domain *h, struct device_node *node,
enum irq_domain_bus_token bus_token)
{
- struct ics *ics;
-
- list_for_each_entry(ics, &ics_list, link)
- if (ics->host_match(ics, node))
- return 1;
-
- return 0;
+ if (WARN_ON(!xics_ics))
+ return 0;
+ return xics_ics->host_match(xics_ics, node) ? 1 : 0;
}
/* Dummies */
@@ -327,12 +323,10 @@ static struct irq_chip xics_ipi_chip = {
.irq_unmask = xics_ipi_unmask,
};
-static int xics_host_map(struct irq_domain *h, unsigned int virq,
- irq_hw_number_t hw)
+static int xics_host_map(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
{
- struct ics *ics;
-
- pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw);
+ pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hwirq);
/*
* Mark interrupts as edge sensitive by default so that resend
@@ -342,18 +336,23 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq,
irq_clear_status_flags(virq, IRQ_LEVEL);
/* Don't call into ICS for IPIs */
- if (hw == XICS_IPI) {
+ if (hwirq == XICS_IPI) {
irq_set_chip_and_handler(virq, &xics_ipi_chip,
handle_percpu_irq);
return 0;
}
- /* Let the ICS setup the chip data */
- list_for_each_entry(ics, &ics_list, link)
- if (ics->map(ics, virq) == 0)
- return 0;
+ if (WARN_ON(!xics_ics))
+ return -EINVAL;
+
+ if (xics_ics->check(xics_ics, hwirq))
+ return -EINVAL;
+
+ /* No chip data for the XICS domain */
+ irq_domain_set_info(domain, virq, hwirq, xics_ics->chip,
+ NULL, handle_fasteoi_irq, NULL, NULL);
- return -EINVAL;
+ return 0;
}
static int xics_host_xlate(struct irq_domain *h, struct device_node *ct,
@@ -412,22 +411,76 @@ int xics_retrigger(struct irq_data *data)
return 0;
}
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+static int xics_host_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec,
+ unsigned long *hwirq, unsigned int *type)
+{
+ return xics_host_xlate(d, to_of_node(fwspec->fwnode), fwspec->param,
+ fwspec->param_count, hwirq, type);
+}
+
+static int xics_host_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_fwspec *fwspec = arg;
+ irq_hw_number_t hwirq;
+ unsigned int type = IRQ_TYPE_NONE;
+ int i, rc;
+
+ rc = xics_host_domain_translate(domain, fwspec, &hwirq, &type);
+ if (rc)
+ return rc;
+
+ pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs);
+
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_set_info(domain, virq + i, hwirq + i, xics_ics->chip,
+ xics_ics, handle_fasteoi_irq, NULL, NULL);
+
+ return 0;
+}
+
+static void xics_host_domain_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ pr_debug("%s %d #%d\n", __func__, virq, nr_irqs);
+}
+#endif
+
static const struct irq_domain_ops xics_host_ops = {
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ .alloc = xics_host_domain_alloc,
+ .free = xics_host_domain_free,
+ .translate = xics_host_domain_translate,
+#endif
.match = xics_host_match,
.map = xics_host_map,
.xlate = xics_host_xlate,
};
-static void __init xics_init_host(void)
+static int __init xics_allocate_domain(void)
{
- xics_host = irq_domain_add_tree(NULL, &xics_host_ops, NULL);
- BUG_ON(xics_host == NULL);
+ struct fwnode_handle *fn;
+
+ fn = irq_domain_alloc_named_fwnode("XICS");
+ if (!fn)
+ return -ENOMEM;
+
+ xics_host = irq_domain_create_tree(fn, &xics_host_ops, NULL);
+ if (!xics_host) {
+ irq_domain_free_fwnode(fn);
+ return -ENOMEM;
+ }
+
irq_set_default_host(xics_host);
+ return 0;
}
void __init xics_register_ics(struct ics *ics)
{
- list_add(&ics->link, &ics_list);
+ if (WARN_ONCE(xics_ics, "XICS: Source Controller is already defined!"))
+ return;
+ xics_ics = ics;
}
static void __init xics_get_server_size(void)
@@ -484,6 +537,8 @@ void __init xics_init(void)
/* Initialize common bits */
xics_get_server_size();
xics_update_irq_servers();
- xics_init_host();
+ rc = xics_allocate_domain();
+ if (rc < 0)
+ pr_err("XICS: Failed to create IRQ domain");
xics_setup_cpu();
}
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 8183ca343675..c732ce5a3e1a 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -21,7 +21,6 @@
#include <linux/msi.h>
#include <linux/vmalloc.h>
-#include <asm/debugfs.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/smp.h>
@@ -313,11 +312,10 @@ void xmon_xive_get_irq_all(void)
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
- struct irq_data *d = irq_desc_get_irq_data(desc);
- unsigned int hwirq = (unsigned int)irqd_to_hwirq(d);
+ struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i);
- if (d->domain == xive_irq_domain)
- xmon_xive_get_irq_config(hwirq, d);
+ if (d)
+ xmon_xive_get_irq_config(irqd_to_hwirq(d), d);
}
}
@@ -617,16 +615,6 @@ static unsigned int xive_irq_startup(struct irq_data *d)
pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
d->irq, hw_irq, d);
-#ifdef CONFIG_PCI_MSI
- /*
- * The generic MSI code returns with the interrupt disabled on the
- * card, using the MSI mask bits. Firmware doesn't appear to unmask
- * at that level, so we do it here by hand.
- */
- if (irq_data_get_msi_desc(d))
- pci_msi_unmask_irq(d);
-#endif
-
/* Pick a target */
target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d));
if (target == XIVE_INVALID_TARGET) {
@@ -714,16 +702,12 @@ static int xive_irq_set_affinity(struct irq_data *d,
u32 target, old_target;
int rc = 0;
- pr_devel("xive_irq_set_affinity: irq %d\n", d->irq);
+ pr_debug("%s: irq %d/%x\n", __func__, d->irq, hw_irq);
/* Is this valid ? */
if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
return -EINVAL;
- /* Don't do anything if the interrupt isn't started */
- if (!irqd_is_started(d))
- return IRQ_SET_MASK_OK;
-
/*
* If existing target is already in the new mask, and is
* online then do nothing.
@@ -759,7 +743,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
return rc;
}
- pr_devel(" target: 0x%x\n", target);
+ pr_debug(" target: 0x%x\n", target);
xd->target = target;
/* Give up previous target */
@@ -990,6 +974,8 @@ EXPORT_SYMBOL_GPL(is_xive_irq);
void xive_cleanup_irq_data(struct xive_irq_data *xd)
{
+ pr_debug("%s for HW %x\n", __func__, xd->hw_irq);
+
if (xd->eoi_mmio) {
iounmap(xd->eoi_mmio);
if (xd->eoi_mmio == xd->trig_mmio)
@@ -1031,7 +1017,7 @@ static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
return 0;
}
-static void xive_irq_free_data(unsigned int virq)
+void xive_irq_free_data(unsigned int virq)
{
struct xive_irq_data *xd = irq_get_handler_data(virq);
@@ -1041,6 +1027,7 @@ static void xive_irq_free_data(unsigned int virq)
xive_cleanup_irq_data(xd);
kfree(xd);
}
+EXPORT_SYMBOL_GPL(xive_irq_free_data);
#ifdef CONFIG_SMP
@@ -1179,7 +1166,7 @@ static int xive_request_ipi(unsigned int cpu)
return 0;
ret = request_irq(xid->irq, xive_muxed_ipi_action,
- IRQF_PERCPU | IRQF_NO_THREAD,
+ IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD,
xid->name, NULL);
WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
@@ -1379,7 +1366,71 @@ static void xive_irq_domain_debug_show(struct seq_file *m, struct irq_domain *d,
}
#endif
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+static int xive_irq_domain_translate(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *hwirq,
+ unsigned int *type)
+{
+ return xive_irq_domain_xlate(d, to_of_node(fwspec->fwnode),
+ fwspec->param, fwspec->param_count,
+ hwirq, type);
+}
+
+static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_fwspec *fwspec = arg;
+ irq_hw_number_t hwirq;
+ unsigned int type = IRQ_TYPE_NONE;
+ int i, rc;
+
+ rc = xive_irq_domain_translate(domain, fwspec, &hwirq, &type);
+ if (rc)
+ return rc;
+
+ pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs);
+
+ for (i = 0; i < nr_irqs; i++) {
+ /* TODO: call xive_irq_domain_map() */
+
+ /*
+ * Mark interrupts as edge sensitive by default so that resend
+ * actually works. Will fix that up below if needed.
+ */
+ irq_clear_status_flags(virq, IRQ_LEVEL);
+
+ /* allocates and sets handler data */
+ rc = xive_irq_alloc_data(virq + i, hwirq + i);
+ if (rc)
+ return rc;
+
+ irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+ &xive_irq_chip, domain->host_data);
+ irq_set_handler(virq + i, handle_fasteoi_irq);
+ }
+
+ return 0;
+}
+
+static void xive_irq_domain_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ int i;
+
+ pr_debug("%s %d #%d\n", __func__, virq, nr_irqs);
+
+ for (i = 0; i < nr_irqs; i++)
+ xive_irq_free_data(virq + i);
+}
+#endif
+
static const struct irq_domain_ops xive_irq_domain_ops = {
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ .alloc = xive_irq_domain_alloc,
+ .free = xive_irq_domain_free,
+ .translate = xive_irq_domain_translate,
+#endif
.match = xive_irq_domain_match,
.map = xive_irq_domain_map,
.unmap = xive_irq_domain_unmap,
@@ -1717,9 +1768,9 @@ static int xive_core_debug_show(struct seq_file *m, void *private)
xive_debug_show_cpu(m, cpu);
for_each_irq_desc(i, desc) {
- struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i);
- if (d->domain == xive_irq_domain)
+ if (d)
xive_debug_show_irq(m, d);
}
return 0;
@@ -1729,7 +1780,7 @@ DEFINE_SHOW_ATTRIBUTE(xive_core_debug);
int xive_core_debug_init(void)
{
if (xive_enabled())
- debugfs_create_file("xive", 0400, powerpc_debugfs_root,
+ debugfs_create_file("xive", 0400, arch_debugfs_dir,
NULL, &xive_core_debug_fops);
return 0;
}
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 57e3f1540435..1aec282cd650 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -41,6 +41,7 @@ static u32 xive_queue_shift;
static u32 xive_pool_vps = XIVE_INVALID_VP;
static struct kmem_cache *xive_provision_cache;
static bool xive_has_single_esc;
+static bool xive_has_save_restore;
int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
{
@@ -588,6 +589,9 @@ bool __init xive_native_init(void)
if (of_get_property(np, "single-escalation-support", NULL) != NULL)
xive_has_single_esc = true;
+ if (of_get_property(np, "vp-save-restore", NULL))
+ xive_has_save_restore = true;
+
/* Configure Thread Management areas for KVM */
for_each_possible_cpu(cpu)
kvmppc_set_xive_tima(cpu, r.start, tima);
@@ -752,6 +756,12 @@ bool xive_native_has_single_escalation(void)
}
EXPORT_SYMBOL_GPL(xive_native_has_single_escalation);
+bool xive_native_has_save_restore(void)
+{
+ return xive_has_save_restore;
+}
+EXPORT_SYMBOL_GPL(xive_native_has_save_restore);
+
int xive_native_get_queue_info(u32 vp_id, u32 prio,
u64 *out_qpage,
u64 *out_qsize,
diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh
index e32d3162e5ed..689907cda996 100644
--- a/arch/powerpc/tools/head_check.sh
+++ b/arch/powerpc/tools/head_check.sh
@@ -49,30 +49,30 @@ vmlinux="$2"
$nm "$vmlinux" | grep -e " [TA] _stext$" -e " t start_first_256B$" -e " a text_start$" -e " t start_text$" > .tmp_symbols.txt
-vma=$(cat .tmp_symbols.txt | grep -e " [TA] _stext$" | cut -d' ' -f1)
+vma=$(grep -e " [TA] _stext$" .tmp_symbols.txt | cut -d' ' -f1)
-expected_start_head_addr=$vma
+expected_start_head_addr="$vma"
-start_head_addr=$(cat .tmp_symbols.txt | grep " t start_first_256B$" | cut -d' ' -f1)
+start_head_addr=$(grep " t start_first_256B$" .tmp_symbols.txt | cut -d' ' -f1)
if [ "$start_head_addr" != "$expected_start_head_addr" ]; then
- echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr"
- echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option"
- echo "ERROR: see comments in arch/powerpc/tools/head_check.sh"
+ echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" 1>&2
+ echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" 1>&2
+ echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" 1>&2
exit 1
fi
-top_vma=$(echo $vma | cut -d'0' -f1)
+top_vma=$(echo "$vma" | cut -d'0' -f1)
-expected_start_text_addr=$(cat .tmp_symbols.txt | grep " a text_start$" | cut -d' ' -f1 | sed "s/^0/$top_vma/")
+expected_start_text_addr=$(grep " a text_start$" .tmp_symbols.txt | cut -d' ' -f1 | sed "s/^0/$top_vma/")
-start_text_addr=$(cat .tmp_symbols.txt | grep " t start_text$" | cut -d' ' -f1)
+start_text_addr=$(grep " t start_text$" .tmp_symbols.txt | cut -d' ' -f1)
if [ "$start_text_addr" != "$expected_start_text_addr" ]; then
- echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr"
- echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option"
- echo "ERROR: see comments in arch/powerpc/tools/head_check.sh"
+ echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" 1>&2
+ echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" 1>&2
+ echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" 1>&2
exit 1
fi
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index da4d7f225a40..dd8241c009e5 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -26,8 +26,8 @@
#include <linux/ctype.h>
#include <linux/highmem.h>
#include <linux/security.h>
+#include <linux/debugfs.h>
-#include <asm/debugfs.h>
#include <asm/ptrace.h>
#include <asm/smp.h>
#include <asm/string.h>
@@ -482,16 +482,6 @@ static inline void get_output_lock(void) {}
static inline void release_output_lock(void) {}
#endif
-static inline int unrecoverable_excp(struct pt_regs *regs)
-{
-#if defined(CONFIG_4xx) || defined(CONFIG_PPC_BOOK3E)
- /* We have no MSR_RI bit on 4xx or Book3e, so we simply return false */
- return 0;
-#else
- return ((regs->msr & MSR_RI) == 0);
-#endif
-}
-
static void xmon_touch_watchdogs(void)
{
touch_softlockup_watchdog_sync();
@@ -565,7 +555,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi)
bp = NULL;
if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) == (MSR_IR|MSR_64BIT))
bp = at_breakpoint(regs->nip);
- if (bp || unrecoverable_excp(regs))
+ if (bp || regs_is_unrecoverable(regs))
fromipi = 0;
if (!fromipi) {
@@ -577,7 +567,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi)
cpu, BP_NUM(bp));
xmon_print_symbol(regs->nip, " ", ")\n");
}
- if (unrecoverable_excp(regs))
+ if (regs_is_unrecoverable(regs))
printf("WARNING: exception is not recoverable, "
"can't continue\n");
release_output_lock();
@@ -693,7 +683,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi)
printf("Stopped at breakpoint %tx (", BP_NUM(bp));
xmon_print_symbol(regs->nip, " ", ")\n");
}
- if (unrecoverable_excp(regs))
+ if (regs_is_unrecoverable(regs))
printf("WARNING: exception is not recoverable, "
"can't continue\n");
remove_bpts();
@@ -4077,8 +4067,8 @@ DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get,
static int __init setup_xmon_dbgfs(void)
{
- debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL,
- &xmon_dbgfs_ops);
+ debugfs_create_file("xmon", 0600, arch_debugfs_dir, NULL,
+ &xmon_dbgfs_ops);
return 0;
}
device_initcall(setup_xmon_dbgfs);
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 93720b015bb6..fc818c86cc9b 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -819,38 +819,22 @@ static void __init reserve_crashkernel(void)
crash_size = PAGE_ALIGN(crash_size);
- if (crash_base == 0) {
- /*
- * Current riscv boot protocol requires 2MB alignment for
- * RV64 and 4MB alignment for RV32 (hugepage size)
- */
- crash_base = memblock_find_in_range(search_start, search_end,
- crash_size, PMD_SIZE);
-
- if (crash_base == 0) {
- pr_warn("crashkernel: couldn't allocate %lldKB\n",
- crash_size >> 10);
- return;
- }
- } else {
- /* User specifies base address explicitly. */
- if (!memblock_is_region_memory(crash_base, crash_size)) {
- pr_warn("crashkernel: requested region is not memory\n");
- return;
- }
-
- if (memblock_is_region_reserved(crash_base, crash_size)) {
- pr_warn("crashkernel: requested region is reserved\n");
- return;
- }
-
+ if (crash_base) {
+ search_start = crash_base;
+ search_end = crash_base + crash_size;
+ }
- if (!IS_ALIGNED(crash_base, PMD_SIZE)) {
- pr_warn("crashkernel: requested region is misaligned\n");
- return;
- }
+ /*
+ * Current riscv boot protocol requires 2MB alignment for
+ * RV64 and 4MB alignment for RV32 (hugepage size)
+ */
+ crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE,
+ search_start, search_end);
+ if (crash_base == 0) {
+ pr_warn("crashkernel: couldn't allocate %lldKB\n",
+ crash_size >> 10);
+ return;
}
- memblock_reserve(crash_base, crash_size);
pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n",
crash_base, crash_base + crash_size, crash_size >> 20);
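The riscv, s390, x86, ACPI and arch_numa hunks in this series all apply the same conversion: the two-step memblock_find_in_range() + memblock_reserve() pattern is folded into a single memblock_phys_alloc_range() call, which both finds and reserves the region. A minimal sketch of the before/after shape, with an illustrative function name and limits (not taken from any of the patched files); note the argument order differs between the two interfaces:

#include <linux/memblock.h>
#include <linux/sizes.h>

/* Old pattern: find a free range, then reserve it in a second step. */
static phys_addr_t __init reserve_example_old(phys_addr_t size)
{
	phys_addr_t base;

	base = memblock_find_in_range(0, SZ_4G, size, PAGE_SIZE);
	if (!base)
		return 0;
	memblock_reserve(base, size);
	return base;
}

/* New pattern: allocation and reservation happen in one call. */
static phys_addr_t __init reserve_example_new(phys_addr_t size)
{
	return memblock_phys_alloc_range(size, PAGE_SIZE, 0, SZ_4G);
}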
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index fe14beb338e5..5a01872f5984 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -677,8 +677,9 @@ static void __init reserve_crashkernel(void)
return;
}
low = crash_base ?: low;
- crash_base = memblock_find_in_range(low, high, crash_size,
- KEXEC_CRASH_MEM_ALIGN);
+ crash_base = memblock_phys_alloc_range(crash_size,
+ KEXEC_CRASH_MEM_ALIGN,
+ low, high);
}
if (!crash_base) {
@@ -687,8 +688,10 @@ static void __init reserve_crashkernel(void)
return;
}
- if (register_memory_notifier(&kdump_mem_nb))
+ if (register_memory_notifier(&kdump_mem_nb)) {
+ memblock_free(crash_base, crash_size);
return;
+ }
if (!oldmem_data.start && MACHINE_IS_VM)
diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size));
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index aa705e1bd0dc..aa9d68b8ee14 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -449,3 +449,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease sys_process_mrelease
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 212632d57db9..a834e4672f72 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -822,7 +822,7 @@ void do_secure_storage_access(struct pt_regs *regs)
break;
case KERNEL_FAULT:
page = phys_to_page(addr);
- if (unlikely(!try_get_page(page)))
+ if (unlikely(!try_get_compound_head(page, 1)))
break;
rc = arch_make_page_accessible(page);
put_page(page);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index f3db3caa8447..f14e7e61cd8e 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -187,9 +187,9 @@ static void pv_init(void)
return;
/* make sure bounce buffers are shared */
+ swiotlb_force = SWIOTLB_FORCE;
swiotlb_init(1);
swiotlb_update_mem_attributes();
- swiotlb_force = SWIOTLB_FORCE;
}
void __init mem_init(void)
diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h
index 4486a865ff62..372afa82fee6 100644
--- a/arch/sh/include/asm/cacheflush.h
+++ b/arch/sh/include/asm/cacheflush.h
@@ -63,6 +63,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
if (boot_cpu_data.dcache.n_aliases && PageAnon(page))
__flush_anon_page(page, vmaddr);
}
+
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
static inline void flush_kernel_vmap_range(void *addr, int size)
{
__flush_wback_region(addr, size);
@@ -72,12 +74,6 @@ static inline void invalidate_kernel_vmap_range(void *addr, int size)
__flush_invalidate_region(addr, size);
}
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-static inline void flush_kernel_dcache_page(struct page *page)
-{
- flush_dcache_page(page);
-}
-
extern void copy_to_user_page(struct vm_area_struct *vma,
struct page *page, unsigned long vaddr, void *dst, const void *src,
unsigned long len);
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 7bbd6700ae4b..208f131659c5 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -449,3 +449,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index f520e9cd2c78..7893104718c2 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -492,3 +492,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index a5beae6daf20..61f18b72552b 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -452,3 +452,4 @@
445 i386 landlock_add_rule sys_landlock_add_rule
446 i386 landlock_restrict_self sys_landlock_restrict_self
447 i386 memfd_secret sys_memfd_secret
+448 i386 process_mrelease sys_process_mrelease
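process_mrelease (number 448 in the tables touched here) takes a pidfd and a flags word and reaps the address space of an already-dying process. A hedged userspace sketch, assuming a libc without a dedicated wrapper, so the raw syscall() interface is used and __NR_process_mrelease gets a fallback definition:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_process_mrelease
#define __NR_process_mrelease 448	/* number added by this series on most arches */
#endif

static int process_mrelease(int pidfd, unsigned int flags)
{
	return syscall(__NR_process_mrelease, pidfd, flags);
}

int main(void)
{
	/* Example only: -1 is not a valid pidfd, so this just reports the errno. */
	if (process_mrelease(-1, 0) < 0)
		perror("process_mrelease");
	return 0;
}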
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f6b57799c1ea..807b6a1de8e8 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -369,6 +369,7 @@
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
447 common memfd_secret sys_memfd_secret
+448 common process_mrelease sys_process_mrelease
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 294ed4392a0e..10562885f5fc 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -109,14 +109,13 @@ static u32 __init allocate_aperture(void)
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
- addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
- aper_size, aper_size);
+ addr = memblock_phys_alloc_range(aper_size, aper_size,
+ GART_MIN_ADDR, GART_MAX_ADDR);
if (!addr) {
pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
- memblock_reserve(addr, aper_size);
pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
addr, addr + aper_size - 1, aper_size >> 10);
register_nosave_region(addr >> PAGE_SHIFT,
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index aa15132228da..525876e7b9f4 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
if (num_entries > LDT_ENTRIES)
return NULL;
- new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+ new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT);
if (!new_ldt)
return NULL;
@@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
* than PAGE_SIZE.
*/
if (alloc_size > PAGE_SIZE)
- new_ldt->entries = vzalloc(alloc_size);
+ new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
else
- new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
+ new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!new_ldt->entries) {
kfree(new_ldt);
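The LDT hunk switches these allocations to GFP_KERNEL_ACCOUNT so that memory a task can force the kernel to allocate is charged to that task's memory cgroup. A minimal sketch of the pattern, around an invented structure; __vmalloc() is used because, unlike vzalloc(), it accepts the accounted gfp mask:

#include <linux/slab.h>
#include <linux/vmalloc.h>

struct example_buf {
	void *data;
	size_t size;
};

/* Charged to the caller's memcg: use for allocations userspace can trigger. */
static struct example_buf *example_buf_alloc(size_t size)
{
	struct example_buf *buf;

	buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
	if (!buf)
		return NULL;

	buf->data = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!buf->data) {
		kfree(buf);
		return NULL;
	}
	buf->size = size;
	return buf;
}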
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 75ef19aa8903..23a14d82e783 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -127,14 +127,12 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned long ret = 0;
if (min_pfn_mapped < max_pfn_mapped) {
- ret = memblock_find_in_range(
+ ret = memblock_phys_alloc_range(
+ PAGE_SIZE * num, PAGE_SIZE,
min_pfn_mapped << PAGE_SHIFT,
- max_pfn_mapped << PAGE_SHIFT,
- PAGE_SIZE * num , PAGE_SIZE);
+ max_pfn_mapped << PAGE_SHIFT);
}
- if (ret)
- memblock_reserve(ret, PAGE_SIZE * num);
- else if (can_use_brk_pgt)
+ if (!ret && can_use_brk_pgt)
ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
if (!ret)
@@ -610,8 +608,17 @@ static void __init memory_map_top_down(unsigned long map_start,
unsigned long addr;
unsigned long mapped_ram_size = 0;
- /* xen has big range in reserved near end of ram, skip it at first.*/
- addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+ /*
+ * Systems that have many reserved areas near top of the memory,
+ * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will
+ * require lots of 4K mappings which may exhaust pgt_buf.
+ * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure
+ * there is enough mapped memory that can be allocated from
+ * memblock.
+ */
+ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
+ map_end);
+ memblock_free(addr, PMD_SIZE);
real_end = addr + PMD_SIZE;
/* step_size need to be small so pgt_buf from BRK could cover it */
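With memblock_find_in_range() on its way out, this path still only wants an address, not a reservation, so it allocates a PMD_SIZE chunk and frees it again immediately. A sketch of that find-an-address idiom under the memblock API of this series (illustrative function name; memblock_free() here still takes a physical address and size):

#include <linux/memblock.h>
#include <linux/pgtable.h>

/*
 * Locate the top-most PMD_SIZE-aligned chunk in [start, limit) without
 * keeping it reserved.  memblock allocates top-down by default, so the
 * returned address is the highest suitable one.
 */
static phys_addr_t __init find_top_pmd_chunk(phys_addr_t start, phys_addr_t limit)
{
	phys_addr_t addr;

	addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, start, limit);
	if (addr)
		memblock_free(addr, PMD_SIZE);	/* only the address is needed */
	return addr;
}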
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index e94da744386f..a1b5c71099e6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -376,15 +376,14 @@ static int __init numa_alloc_distance(void)
cnt++;
size = cnt * cnt * sizeof(numa_distance[0]);
- phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
- size, PAGE_SIZE);
+ phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
if (!phys) {
pr_warn("Warning: can't allocate distance table!\n");
/* don't retry until explicitly reset */
numa_distance = (void *)1LU;
return -ENOMEM;
}
- memblock_reserve(phys, size);
numa_distance = __va(phys);
numa_distance_cnt = cnt;
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 87d77cc52f86..737491b13728 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -447,13 +447,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
if (numa_dist_cnt) {
u64 phys;
- phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
- phys_size, PAGE_SIZE);
+ phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
if (!phys) {
pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
goto no_emu;
}
- memblock_reserve(phys, phys_size);
phys_dist = __va(phys);
for (i = 0; i < numa_dist_cnt; i++)
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 6534c92d0f83..31b5856010cb 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -28,7 +28,7 @@ void __init reserve_real_mode(void)
WARN_ON(slab_is_available());
/* Has to be under 1M so we can execute real-mode AP code. */
- mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+ mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20);
if (!mem)
pr_info("No sub-1M memory is available for the trampoline\n");
else
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index b3d1bc8a9095..104b327f8ac9 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -417,3 +417,5 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease
diff --git a/block/blk-map.c b/block/blk-map.c
index d1448aaad980..4526adde0156 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -309,7 +309,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
static void bio_invalidate_vmalloc_pages(struct bio *bio)
{
-#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+#ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
if (bio->bi_private && !op_is_write(bio_op(bio))) {
unsigned long i, len = 0;
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index a37a1532a575..f9383736fa0f 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -583,8 +583,8 @@ void __init acpi_table_upgrade(void)
}
acpi_tables_addr =
- memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS,
- all_tables_size, PAGE_SIZE);
+ memblock_phys_alloc_range(all_tables_size, PAGE_SIZE,
+ 0, ACPI_TABLE_UPGRADE_MAX_PHYS);
if (!acpi_tables_addr) {
WARN_ON(1);
return;
@@ -599,7 +599,6 @@ void __init acpi_table_upgrade(void)
* Both memblock_reserve and e820__range_add (via arch_reserve_mem_area)
* works fine.
*/
- memblock_reserve(acpi_tables_addr, all_tables_size);
arch_reserve_mem_area(acpi_tables_addr, all_tables_size);
/*
diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 4cc4e117727d..46c503486e96 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -279,13 +279,10 @@ static int __init numa_alloc_distance(void)
int i, j;
size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
- phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),
- size, PAGE_SIZE);
+ phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn));
if (WARN_ON(!phys))
return -ENOMEM;
- memblock_reserve(phys, size);
-
numa_distance = __va(phys);
numa_distance_cnt = nr_node_ids;
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 688a592ad569..e65dd803a453 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -27,6 +27,7 @@
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
+#include <linux/swiotlb.h>
#include <linux/sysfs.h>
#include <linux/dma-map-ops.h> /* for dma_default_coherent */
@@ -2851,6 +2852,9 @@ void device_initialize(struct device *dev)
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
dev->dma_coherent = dma_default_coherent;
#endif
+#ifdef CONFIG_SWIOTLB
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+#endif
}
EXPORT_SYMBOL_GPL(device_initialize);
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index aa31a21f33d7..e3fd2dbf4eea 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -578,9 +578,9 @@ static struct memory_block *find_memory_block_by_id(unsigned long block_id)
/*
* Called under device_hotplug_lock.
*/
-struct memory_block *find_memory_block(struct mem_section *section)
+struct memory_block *find_memory_block(unsigned long section_nr)
{
- unsigned long block_id = memory_block_id(__section_nr(section));
+ unsigned long block_id = memory_block_id(section_nr);
return find_memory_block_by_id(block_id);
}
diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig
index e873f9ea2e65..c5b3dc97396a 100644
--- a/drivers/clk/Kconfig
+++ b/drivers/clk/Kconfig
@@ -403,6 +403,7 @@ source "drivers/clk/mediatek/Kconfig"
source "drivers/clk/meson/Kconfig"
source "drivers/clk/mstar/Kconfig"
source "drivers/clk/mvebu/Kconfig"
+source "drivers/clk/pistachio/Kconfig"
source "drivers/clk/qcom/Kconfig"
source "drivers/clk/ralink/Kconfig"
source "drivers/clk/renesas/Kconfig"
diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile
index 2b91d34c582b..e42312121e51 100644
--- a/drivers/clk/Makefile
+++ b/drivers/clk/Makefile
@@ -97,7 +97,7 @@ obj-y += mstar/
obj-y += mvebu/
obj-$(CONFIG_ARCH_MXS) += mxs/
obj-$(CONFIG_COMMON_CLK_NXP) += nxp/
-obj-$(CONFIG_MACH_PISTACHIO) += pistachio/
+obj-$(CONFIG_COMMON_CLK_PISTACHIO) += pistachio/
obj-$(CONFIG_COMMON_CLK_PXA) += pxa/
obj-$(CONFIG_COMMON_CLK_QCOM) += qcom/
obj-y += ralink/
diff --git a/drivers/clk/pistachio/Kconfig b/drivers/clk/pistachio/Kconfig
new file mode 100644
index 000000000000..d00f7b4a25fc
--- /dev/null
+++ b/drivers/clk/pistachio/Kconfig
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config COMMON_CLK_PISTACHIO
+ bool "Support for IMG Pistachio SoC clock controllers"
+ depends on MIPS || COMPILE_TEST
+ help
+ Support for the IMG Pistachio SoC clock controller.
+ Say Y if you want to include clock support.
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index eb661b539a3e..0f5e3983951a 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -234,8 +234,9 @@ config CLKSRC_LPC32XX
Support for the LPC32XX clocksource.
config CLKSRC_PISTACHIO
- bool "Clocksource for Pistachio SoC" if COMPILE_TEST
+ bool "Clocksource for Pistachio SoC"
depends on HAS_IOMEM
+ depends on MIPS || COMPILE_TEST
select TIMER_OF
help
Enables the clocksource for the Pistachio SoC.
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 23a06cba392c..5a2cf5f91ccb 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -36,6 +36,7 @@
#define MAX_PSTATE_SHIFT 32
#define LPSTATE_SHIFT 48
#define GPSTATE_SHIFT 56
+#define MAX_NR_CHIPS 32
#define MAX_RAMP_DOWN_TIME 5120
/*
@@ -1046,12 +1047,20 @@ static int init_chip_info(void)
unsigned int *chip;
unsigned int cpu, i;
unsigned int prev_chip_id = UINT_MAX;
+ cpumask_t *chip_cpu_mask;
int ret = 0;
chip = kcalloc(num_possible_cpus(), sizeof(*chip), GFP_KERNEL);
if (!chip)
return -ENOMEM;
+ /* Allocate a chip cpu mask large enough to fit mask for all chips */
+ chip_cpu_mask = kcalloc(MAX_NR_CHIPS, sizeof(cpumask_t), GFP_KERNEL);
+ if (!chip_cpu_mask) {
+ ret = -ENOMEM;
+ goto free_and_return;
+ }
+
for_each_possible_cpu(cpu) {
unsigned int id = cpu_to_chip_id(cpu);
@@ -1059,22 +1068,25 @@ static int init_chip_info(void)
prev_chip_id = id;
chip[nr_chips++] = id;
}
+ cpumask_set_cpu(cpu, &chip_cpu_mask[nr_chips-1]);
}
chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
if (!chips) {
ret = -ENOMEM;
- goto free_and_return;
+ goto out_free_chip_cpu_mask;
}
for (i = 0; i < nr_chips; i++) {
chips[i].id = chip[i];
- cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
+ cpumask_copy(&chips[i].mask, &chip_cpu_mask[i]);
INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
for_each_cpu(cpu, &chips[i].mask)
per_cpu(chip_info, cpu) = &chips[i];
}
+out_free_chip_cpu_mask:
+ kfree(chip_cpu_mask);
free_and_return:
kfree(chip);
return ret;
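The powernv change stops using cpumask_of_node() (chip ids are not NUMA node ids) and instead records, per chip, the CPUs that reported that chip id. A small sketch of the grouping step; the helper name and callback are hypothetical, and it shares the patch's assumption that the CPUs of one chip are enumerated contiguously:

#include <linux/cpumask.h>
#include <linux/limits.h>
#include <linux/slab.h>

#define EXAMPLE_MAX_CHIPS 32	/* mirrors MAX_NR_CHIPS in the patch */

static cpumask_t *example_build_chip_masks(unsigned int (*cpu_to_chip)(unsigned int cpu),
					   unsigned int *nr_chips)
{
	cpumask_t *masks;
	unsigned int cpu, n = 0;
	unsigned int prev_id = UINT_MAX;	/* assumes real chip ids differ from UINT_MAX */

	masks = kcalloc(EXAMPLE_MAX_CHIPS, sizeof(cpumask_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for_each_possible_cpu(cpu) {
		unsigned int id = cpu_to_chip(cpu);

		if (id != prev_id) {	/* new chip encountered */
			prev_id = id;
			n++;
		}
		cpumask_set_cpu(cpu, &masks[n - 1]);
	}

	*nr_chips = n;
	return masks;
}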
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index a2b5c6f60cf0..7e7ab5597d7a 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -346,11 +346,9 @@ static int pseries_cpuidle_driver_init(void)
static void __init fixup_cede0_latency(void)
{
struct xcede_latency_payload *payload;
- u64 min_latency_us;
+ u64 min_xcede_latency_us = UINT_MAX;
int i;
- min_latency_us = dedicated_states[1].exit_latency; // CEDE latency
-
if (parse_cede_parameters())
return;
@@ -358,42 +356,45 @@ static void __init fixup_cede0_latency(void)
nr_xcede_records);
payload = &xcede_latency_parameter.payload;
+
+ /*
+ * The CEDE idle state maps to CEDE(0). While the hypervisor
+ * does not advertise CEDE(0) exit latency values, it does
+ * advertise the latency values of the extended CEDE states.
+ * We use the lowest advertised exit latency value as a proxy
+ * for the exit latency of CEDE(0).
+ */
for (i = 0; i < nr_xcede_records; i++) {
struct xcede_latency_record *record = &payload->records[i];
+ u8 hint = record->hint;
u64 latency_tb = be64_to_cpu(record->latency_ticks);
u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
- if (latency_us == 0)
- pr_warn("cpuidle: xcede record %d has an unrealistic latency of 0us.\n", i);
-
- if (latency_us < min_latency_us)
- min_latency_us = latency_us;
- }
-
- /*
- * By default, we assume that CEDE(0) has exit latency 10us,
- * since there is no way for us to query from the platform.
- *
- * However, if the wakeup latency of an Extended CEDE state is
- * smaller than 10us, then we can be sure that CEDE(0)
- * requires no more than that.
- *
- * Perform the fix-up.
- */
- if (min_latency_us < dedicated_states[1].exit_latency) {
/*
- * We set a minimum of 1us wakeup latency for cede0 to
- * distinguish it from snooze
+ * We expect the exit latency of an extended CEDE
+	 * state to be non-zero, since it takes at least
+ * a few nanoseconds to wakeup the idle CPU and
+ * dispatch the virtual processor into the Linux
+ * Guest.
+ *
+	 * So we consider only non-zero values when performing
+	 * the fixup of CEDE(0) latency.
*/
- u64 cede0_latency = 1;
+ if (latency_us == 0) {
+ pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
+ i, hint);
+ continue;
+ }
- if (min_latency_us > cede0_latency)
- cede0_latency = min_latency_us - 1;
+ if (latency_us < min_xcede_latency_us)
+ min_xcede_latency_us = latency_us;
+ }
- dedicated_states[1].exit_latency = cede0_latency;
- dedicated_states[1].target_residency = 10 * (cede0_latency);
+ if (min_xcede_latency_us != UINT_MAX) {
+ dedicated_states[1].exit_latency = min_xcede_latency_us;
+ dedicated_states[1].target_residency = 10 * (min_xcede_latency_us);
pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
- cede0_latency);
+ min_xcede_latency_us);
}
}
@@ -402,7 +403,7 @@ static void __init fixup_cede0_latency(void)
* pseries_idle_probe()
* Choose state table for shared versus dedicated partition
*/
-static int pseries_idle_probe(void)
+static int __init pseries_idle_probe(void)
{
if (cpuidle_disable != IDLE_NO_OVERRIDE)
@@ -419,7 +420,21 @@ static int pseries_idle_probe(void)
cpuidle_state_table = shared_states;
max_idle_state = ARRAY_SIZE(shared_states);
} else {
- fixup_cede0_latency();
+ /*
+ * Use firmware provided latency values
+ * starting with POWER10 platforms. In the
+ * case that we are running on a POWER10
+ * platform but in an earlier compat mode, we
+ * can still use the firmware provided values.
+ *
+ * However, on platforms prior to POWER10, we
+ * cannot rely on the accuracy of the firmware
+ * provided latency values. On such platforms,
+ * go with the conservative default estimate
+ * of 10us.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
+ fixup_cede0_latency();
cpuidle_state_table = dedicated_states;
max_idle_state = NR_DEDICATED_STATES;
}
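The reworked fixup scans the extended CEDE records, skips zero latencies, and takes the smallest remaining value as the CEDE(0) exit latency. The selection step reduces to a minimum over the non-zero entries; a small sketch (a flat array stands in for the xcede record format):

#include <linux/kernel.h>

/* Return the smallest non-zero latency, or U64_MAX if none qualifies. */
static u64 min_nonzero_latency_us(const u64 *latency_us, int nr)
{
	u64 min = U64_MAX;
	int i;

	for (i = 0; i < nr; i++) {
		if (!latency_us[i])
			continue;	/* a zero-latency record is not usable */
		min = min_t(u64, min, latency_us[i]);
	}
	return min;
}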
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index 13b217f75055..e5ae9c06510c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -42,7 +42,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj)
max_order = MAX_ORDER;
#ifdef CONFIG_SWIOTLB
- if (is_swiotlb_active()) {
+ if (is_swiotlb_active(obj->base.dev->dev)) {
unsigned int max_segment;
max_segment = swiotlb_max_segment();
diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c
index f4c2e46b6fe1..2ca9d9a9e5d5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_ttm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c
@@ -276,7 +276,7 @@ nouveau_ttm_init(struct nouveau_drm *drm)
}
#if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86)
- need_swiotlb = is_swiotlb_active();
+ need_swiotlb = is_swiotlb_active(dev->dev);
#endif
ret = ttm_device_init(&drm->ttm.bdev, &nouveau_bo_driver, drm->dev->dev,
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f61516c17589..8ad8618b3530 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -79,16 +79,57 @@ config IOMMU_DEBUGFS
debug/iommu directory, and then populate a subdirectory with
entries as required.
-config IOMMU_DEFAULT_PASSTHROUGH
- bool "IOMMU passthrough by default"
+choice
+ prompt "IOMMU default domain type"
depends on IOMMU_API
+ default IOMMU_DEFAULT_DMA_LAZY if AMD_IOMMU || INTEL_IOMMU
+ default IOMMU_DEFAULT_DMA_STRICT
help
- Enable passthrough by default, removing the need to pass in
- iommu.passthrough=on or iommu=pt through command line. If this
- is enabled, you can still disable with iommu.passthrough=off
- or iommu=nopt depending on the architecture.
+ Choose the type of IOMMU domain used to manage DMA API usage by
+ device drivers. The options here typically represent different
+ levels of tradeoff between robustness/security and performance,
+ depending on the IOMMU driver. Not all IOMMUs support all options.
+ This choice can be overridden at boot via the command line, and for
+ some devices also at runtime via sysfs.
- If unsure, say N here.
+ If unsure, keep the default.
+
+config IOMMU_DEFAULT_DMA_STRICT
+ bool "Translated - Strict"
+ help
+ Trusted devices use translation to restrict their access to only
+ DMA-mapped pages, with strict TLB invalidation on unmap. Equivalent
+ to passing "iommu.passthrough=0 iommu.strict=1" on the command line.
+
+ Untrusted devices always use this mode, with an additional layer of
+ bounce-buffering such that they cannot gain access to any unrelated
+ data within a mapped page.
+
+config IOMMU_DEFAULT_DMA_LAZY
+ bool "Translated - Lazy"
+ help
+ Trusted devices use translation to restrict their access to only
+ DMA-mapped pages, but with "lazy" batched TLB invalidation. This
+ mode allows higher performance with some IOMMUs due to reduced TLB
+ flushing, but at the cost of reduced isolation since devices may be
+ able to access memory for some time after it has been unmapped.
+ Equivalent to passing "iommu.passthrough=0 iommu.strict=0" on the
+ command line.
+
+ If this mode is not supported by the IOMMU driver, the effective
+ runtime default will fall back to IOMMU_DEFAULT_DMA_STRICT.
+
+config IOMMU_DEFAULT_PASSTHROUGH
+ bool "Passthrough"
+ help
+ Trusted devices are identity-mapped, giving them unrestricted access
+ to memory with minimal performance overhead. Equivalent to passing
+ "iommu.passthrough=1" (historically "iommu=pt") on the command line.
+
+ If this mode is not supported by the IOMMU driver, the effective
+ runtime default will fall back to IOMMU_DEFAULT_DMA_STRICT.
+
+endchoice
config OF_IOMMU
def_bool y
@@ -249,6 +290,20 @@ config SPAPR_TCE_IOMMU
Enables bits of IOMMU API required by VFIO. The iommu_ops
is not implemented as it is not necessary for VFIO.
+config APPLE_DART
+ tristate "Apple DART IOMMU Support"
+ depends on ARCH_APPLE || (COMPILE_TEST && !GENERIC_ATOMIC64)
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_LPAE
+ default ARCH_APPLE
+ help
+ Support for Apple DART (Device Address Resolution Table) IOMMUs
+ found in Apple ARM SoCs like the M1.
+ This IOMMU is required for most peripherals using DMA to access
+ the main memory.
+
+ Say Y here if you are using an Apple SoC.
+
# ARM IOMMU support
config ARM_SMMU
tristate "ARM Ltd. System MMU (SMMU) Support"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index c0fb0ba88143..bc7f730edbb0 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o io-pgfault.o
obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
+obj-$(CONFIG_APPLE_DART) += apple-dart.o
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 94c1a7a9876d..8dbe61e2b3c1 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -779,12 +779,6 @@ extern u16 amd_iommu_last_bdf;
/* allocation bitmap for domain ids */
extern unsigned long *amd_iommu_pd_alloc_bitmap;
-/*
- * If true, the addresses will be flushed on unmap time, not when
- * they are reused
- */
-extern bool amd_iommu_unmap_flush;
-
/* Smallest max PASID supported by any IOMMU in the system */
extern u32 amd_iommu_max_pasid;
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 46280e6e1535..bdcf167b4afe 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -161,7 +161,6 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
-bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
system */
@@ -1850,8 +1849,11 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
if (ret)
return ret;
- if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
+ if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) {
+ pr_info("Using strict mode due to virtualization\n");
+ iommu_set_dma_strict();
amd_iommu_np_cache = true;
+ }
init_iommu_perf_ctr(iommu);
@@ -3098,8 +3100,10 @@ static int __init parse_amd_iommu_intr(char *str)
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
- if (strncmp(str, "fullflush", 9) == 0)
- amd_iommu_unmap_flush = true;
+ if (strncmp(str, "fullflush", 9) == 0) {
+ pr_warn("amd_iommu=fullflush deprecated; use iommu.strict=1 instead\n");
+ iommu_set_dma_strict();
+ }
if (strncmp(str, "force_enable", 12) == 0)
amd_iommu_force_enable = true;
if (strncmp(str, "off", 3) == 0)
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index bb0ee5c9fde7..182c93a43efd 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -493,9 +493,6 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
unsigned long offset_mask, pte_pgsize;
u64 *pte, __pte;
- if (pgtable->mode == PAGE_MODE_NONE)
- return iova;
-
pte = fetch_pte(pgtable, iova, &pte_pgsize);
if (!pte || !IOMMU_PTE_PRESENT(*pte))
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 811a49a95d04..1722bb161841 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -425,9 +425,11 @@ static void amd_iommu_report_rmp_hw_error(volatile u32 *event)
if (pdev)
dev_data = dev_iommu_priv_get(&pdev->dev);
- if (dev_data && __ratelimit(&dev_data->rs)) {
- pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
- vmg_tag, spa, flags);
+ if (dev_data) {
+ if (__ratelimit(&dev_data->rs)) {
+ pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
+ vmg_tag, spa, flags);
+ }
} else {
pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
@@ -456,9 +458,11 @@ static void amd_iommu_report_rmp_fault(volatile u32 *event)
if (pdev)
dev_data = dev_iommu_priv_get(&pdev->dev);
- if (dev_data && __ratelimit(&dev_data->rs)) {
- pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
- vmg_tag, gpa, flags_rmp, flags);
+ if (dev_data) {
+ if (__ratelimit(&dev_data->rs)) {
+ pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
+ vmg_tag, gpa, flags_rmp, flags);
+ }
} else {
pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
@@ -480,11 +484,13 @@ static void amd_iommu_report_page_fault(u16 devid, u16 domain_id,
if (pdev)
dev_data = dev_iommu_priv_get(&pdev->dev);
- if (dev_data && __ratelimit(&dev_data->rs)) {
- pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
- domain_id, address, flags);
- } else if (printk_ratelimit()) {
- pr_err("Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
+ if (dev_data) {
+ if (__ratelimit(&dev_data->rs)) {
+ pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
+ domain_id, address, flags);
+ }
+ } else {
+ pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
domain_id, address, flags);
}
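The three event-reporting hunks above restructure the checks so that once per-device state exists, only its ratelimit decides whether to print (a ratelimited event is dropped rather than falling through to the device-less branch), and the device-less fallback uses pr_err_ratelimited(). A minimal sketch of per-object ratelimiting with __ratelimit(), using an invented structure:

#include <linux/printk.h>
#include <linux/ratelimit.h>

struct example_dev_data {
	struct ratelimit_state rs;
};

static void example_report_fault(struct example_dev_data *dd, u64 addr)
{
	if (dd) {
		/* Per-device ratelimit: drop the message, never fall back. */
		if (__ratelimit(&dd->rs))
			pr_err("fault at 0x%llx\n", addr);
	} else {
		/* No device context: use the global ratelimited printk. */
		pr_err_ratelimited("fault at 0x%llx (unknown device)\n", addr);
	}
}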
@@ -1261,15 +1267,52 @@ static void __domain_flush_pages(struct protection_domain *domain,
}
static void domain_flush_pages(struct protection_domain *domain,
- u64 address, size_t size)
+ u64 address, size_t size, int pde)
{
- __domain_flush_pages(domain, address, size, 0);
+ if (likely(!amd_iommu_np_cache)) {
+ __domain_flush_pages(domain, address, size, pde);
+ return;
+ }
+
+ /*
+ * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
+ * In such setups it is best to avoid flushes of ranges which are not
+ * naturally aligned, since it would lead to flushes of unmodified
+ * PTEs. Such flushes would require the hypervisor to do more work than
+ * necessary. Therefore, perform repeated flushes of aligned ranges
+ * until you cover the range. Each iteration flushes the smaller
+ * between the natural alignment of the address that we flush and the
+ * greatest naturally aligned region that fits in the range.
+ */
+ while (size != 0) {
+ int addr_alignment = __ffs(address);
+ int size_alignment = __fls(size);
+ int min_alignment;
+ size_t flush_size;
+
+ /*
+ * size is always non-zero, but address might be zero, causing
+ * addr_alignment to be negative. As the casting of the
+ * argument in __ffs(address) to long might trim the high bits
+ * of the address on x86-32, cast to long when doing the check.
+ */
+ if (likely((unsigned long)address != 0))
+ min_alignment = min(addr_alignment, size_alignment);
+ else
+ min_alignment = size_alignment;
+
+ flush_size = 1ul << min_alignment;
+
+ __domain_flush_pages(domain, address, flush_size, pde);
+ address += flush_size;
+ size -= flush_size;
+ }
}
/* Flush the whole IO/TLB for a given protection domain - including PDE */
void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain)
{
- __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
+ domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
}
void amd_iommu_domain_flush_complete(struct protection_domain *domain)
@@ -1296,7 +1339,7 @@ static void domain_flush_np_cache(struct protection_domain *domain,
unsigned long flags;
spin_lock_irqsave(&domain->lock, flags);
- domain_flush_pages(domain, iova, size);
+ domain_flush_pages(domain, iova, size, 1);
amd_iommu_domain_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
}
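The new domain_flush_pages() loop above splits an arbitrary range into naturally aligned power-of-two chunks so that a vIOMMU only ever sees aligned invalidations: each step flushes 1 << min(alignment of the current address, largest power of two that fits in the remaining size). A standalone sketch of that arithmetic, using compiler builtins in place of the kernel's __ffs()/__fls():

#include <stdint.h>
#include <stdio.h>

/* Print the naturally aligned chunks that cover [addr, addr + size). */
static void split_aligned(uint64_t addr, uint64_t size)
{
	while (size != 0) {
		int size_align = 63 - __builtin_clzll(size);	/* like __fls() */
		int min_align = size_align;
		uint64_t chunk;

		if (addr != 0) {
			int addr_align = __builtin_ctzll(addr);	/* like __ffs() */

			if (addr_align < min_align)
				min_align = addr_align;
		}

		chunk = 1ULL << min_align;
		printf("flush 0x%llx + 0x%llx\n",
		       (unsigned long long)addr, (unsigned long long)chunk);
		addr += chunk;
		size -= chunk;
	}
}

int main(void)
{
	split_aligned(0x3000, 0x5000);	/* 0x3000+0x1000, then 0x4000+0x4000 */
	return 0;
}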
@@ -1707,14 +1750,9 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev)
static void amd_iommu_probe_finalize(struct device *dev)
{
- struct iommu_domain *domain;
-
/* Domains are initialized for this device - have a look what we ended up with */
- domain = iommu_get_domain_for_dev(dev);
- if (domain->type == IOMMU_DOMAIN_DMA)
- iommu_setup_dma_ops(dev, 0, U64_MAX);
- else
- set_dma_ops(dev, NULL);
+ set_dma_ops(dev, NULL);
+ iommu_setup_dma_ops(dev, 0, U64_MAX);
}
static void amd_iommu_release_device(struct device *dev)
@@ -1775,12 +1813,6 @@ void amd_iommu_domain_update(struct protection_domain *domain)
static void __init amd_iommu_init_dma_ops(void)
{
swiotlb = (iommu_default_passthrough() || sme_me_mask) ? 1 : 0;
-
- if (amd_iommu_unmap_flush)
- pr_info("IO/TLB flush on unmap enabled\n");
- else
- pr_info("Lazy IO/TLB flushing enabled\n");
- iommu_set_dma_strict(amd_iommu_unmap_flush);
}
int __init amd_iommu_init_api(void)
@@ -1924,16 +1956,7 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
domain->domain.geometry.aperture_end = ~0ULL;
domain->domain.geometry.force_aperture = true;
- if (type == IOMMU_DOMAIN_DMA &&
- iommu_get_dma_cookie(&domain->domain) == -ENOMEM)
- goto free_domain;
-
return &domain->domain;
-
-free_domain:
- protection_domain_free(domain);
-
- return NULL;
}
static void amd_iommu_domain_free(struct iommu_domain *dom)
@@ -1950,9 +1973,6 @@ static void amd_iommu_domain_free(struct iommu_domain *dom)
if (!dom)
return;
- if (dom->type == IOMMU_DOMAIN_DMA)
- iommu_put_dma_cookie(&domain->domain);
-
if (domain->flags & PD_IOMMUV2_MASK)
free_gcr3_table(domain);
@@ -2022,6 +2042,16 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
return ret;
}
+static void amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
+ unsigned long iova, size_t size)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ struct io_pgtable_ops *ops = &domain->iop.iop.ops;
+
+ if (ops->map)
+ domain_flush_np_cache(domain, iova, size);
+}
+
static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
phys_addr_t paddr, size_t page_size, int iommu_prot,
gfp_t gfp)
@@ -2040,26 +2070,50 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
if (iommu_prot & IOMMU_WRITE)
prot |= IOMMU_PROT_IW;
- if (ops->map) {
+ if (ops->map)
ret = ops->map(ops, iova, paddr, page_size, prot, gfp);
- domain_flush_np_cache(domain, iova, page_size);
- }
return ret;
}
+static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t size)
+{
+ /*
+ * AMD's IOMMU can flush as many pages as necessary in a single flush.
+ * Unless we run in a virtual machine, which can be inferred according
+ * to whether "non-present cache" is on, it is probably best to prefer
+ * (potentially) too extensive TLB flushing (i.e., more misses) over
+	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
+ * hypervisor needs to synchronize the host IOMMU PTEs with those of
+ * the guest, and the trade-off is different: unnecessary TLB flushes
+ * should be avoided.
+ */
+ if (amd_iommu_np_cache &&
+ iommu_iotlb_gather_is_disjoint(gather, iova, size))
+ iommu_iotlb_sync(domain, gather);
+
+ iommu_iotlb_gather_add_range(gather, iova, size);
+}
+
static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
size_t page_size,
struct iommu_iotlb_gather *gather)
{
struct protection_domain *domain = to_pdomain(dom);
struct io_pgtable_ops *ops = &domain->iop.iop.ops;
+ size_t r;
if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
(domain->iop.mode == PAGE_MODE_NONE))
return 0;
- return (ops->unmap) ? ops->unmap(ops, iova, page_size, gather) : 0;
+ r = (ops->unmap) ? ops->unmap(ops, iova, page_size, gather) : 0;
+
+ amd_iommu_iotlb_gather_add_page(dom, gather, iova, page_size);
+
+ return r;
}
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2162,7 +2216,13 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
{
- amd_iommu_flush_iotlb_all(domain);
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dom->lock, flags);
+ domain_flush_pages(dom, gather->start, gather->end - gather->start, 1);
+ amd_iommu_domain_flush_complete(dom);
+ spin_unlock_irqrestore(&dom->lock, flags);
}
static int amd_iommu_def_domain_type(struct device *dev)
@@ -2191,6 +2251,7 @@ const struct iommu_ops amd_iommu_ops = {
.attach_dev = amd_iommu_attach_device,
.detach_dev = amd_iommu_detach_device,
.map = amd_iommu_map,
+ .iotlb_sync_map = amd_iommu_iotlb_sync_map,
.unmap = amd_iommu_unmap,
.iova_to_phys = amd_iommu_iova_to_phys,
.probe_device = amd_iommu_probe_device,
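amd_iommu_unmap() now feeds every unmapped range into the iotlb gather so that iotlb_sync() can issue one ranged flush, and on virtualized setups (np_cache) the gather is flushed early whenever the new range is disjoint from what has been accumulated. A self-contained sketch of that accumulate-or-flush logic; the struct is a stand-in, not the kernel's iommu_iotlb_gather:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct gather {
	uint64_t start;
	uint64_t end;	/* inclusive end of the gathered range, 0 if empty */
};

static bool gather_empty(const struct gather *g)
{
	return g->end == 0;
}

static void gather_flush(struct gather *g)
{
	if (!gather_empty(g))
		printf("flush [0x%llx, 0x%llx]\n",
		       (unsigned long long)g->start, (unsigned long long)g->end);
	g->start = UINT64_MAX;
	g->end = 0;
}

/* Add [iova, iova + size); flush first if the new range is disjoint. */
static void gather_add(struct gather *g, uint64_t iova, uint64_t size)
{
	uint64_t end = iova + size - 1;

	if (!gather_empty(g) && (iova > g->end + 1 || end + 1 < g->start))
		gather_flush(g);

	if (iova < g->start)
		g->start = iova;
	if (end > g->end)
		g->end = end;
}

int main(void)
{
	struct gather g = { .start = UINT64_MAX, .end = 0 };

	gather_add(&g, 0x1000, 0x1000);
	gather_add(&g, 0x2000, 0x1000);	/* adjacent: merged */
	gather_add(&g, 0x9000, 0x1000);	/* disjoint: triggers an early flush */
	gather_flush(&g);		/* what iotlb_sync() would flush */
	return 0;
}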
diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c
index f8d4ad421e07..a9e568276c99 100644
--- a/drivers/iommu/amd/iommu_v2.c
+++ b/drivers/iommu/amd/iommu_v2.c
@@ -6,6 +6,7 @@
#define pr_fmt(fmt) "AMD-Vi: " fmt
+#include <linux/refcount.h>
#include <linux/mmu_notifier.h>
#include <linux/amd-iommu.h>
#include <linux/mm_types.h>
@@ -33,7 +34,7 @@ struct pri_queue {
struct pasid_state {
struct list_head list; /* For global state-list */
- atomic_t count; /* Reference count */
+ refcount_t count; /* Reference count */
unsigned mmu_notifier_count; /* Counting nested mmu_notifier
calls */
struct mm_struct *mm; /* mm_struct for the faults */
@@ -242,7 +243,7 @@ static struct pasid_state *get_pasid_state(struct device_state *dev_state,
ret = *ptr;
if (ret)
- atomic_inc(&ret->count);
+ refcount_inc(&ret->count);
out_unlock:
spin_unlock_irqrestore(&dev_state->lock, flags);
@@ -257,14 +258,14 @@ static void free_pasid_state(struct pasid_state *pasid_state)
static void put_pasid_state(struct pasid_state *pasid_state)
{
- if (atomic_dec_and_test(&pasid_state->count))
+ if (refcount_dec_and_test(&pasid_state->count))
wake_up(&pasid_state->wq);
}
static void put_pasid_state_wait(struct pasid_state *pasid_state)
{
- atomic_dec(&pasid_state->count);
- wait_event(pasid_state->wq, !atomic_read(&pasid_state->count));
+ refcount_dec(&pasid_state->count);
+ wait_event(pasid_state->wq, !refcount_read(&pasid_state->count));
free_pasid_state(pasid_state);
}
@@ -624,7 +625,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid,
goto out;
- atomic_set(&pasid_state->count, 1);
+ refcount_set(&pasid_state->count, 1);
init_waitqueue_head(&pasid_state->wq);
spin_lock_init(&pasid_state->lock);
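The iommu_v2 conversion replaces the open-coded atomic_t reference count with refcount_t, which saturates instead of wrapping and warns on misuse. A minimal sketch of the API surface the patch relies on, around an invented object:

#include <linux/refcount.h>
#include <linux/slab.h>

struct example_obj {
	refcount_t count;
};

static struct example_obj *example_obj_new(void)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		refcount_set(&obj->count, 1);	/* initial reference */
	return obj;
}

static void example_obj_get(struct example_obj *obj)
{
	refcount_inc(&obj->count);	/* WARNs if the count was already 0 */
}

static void example_obj_put(struct example_obj *obj)
{
	/* Returns true only for the final put, so the free happens once. */
	if (refcount_dec_and_test(&obj->count))
		kfree(obj);
}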
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
new file mode 100644
index 000000000000..559db9259e65
--- /dev/null
+++ b/drivers/iommu/apple-dart.c
@@ -0,0 +1,923 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Apple DART (Device Address Resolution Table) IOMMU driver
+ *
+ * Copyright (C) 2021 The Asahi Linux Contributors
+ *
+ * Based on arm/arm-smmu/arm-smmu.c and arm/arm-smmu-v3/arm-smmu-v3.c
+ * Copyright (C) 2013 ARM Limited
+ * Copyright (C) 2015 ARM Limited
+ * and on exynos-iommu.c
+ * Copyright (c) 2011,2016 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitfield.h>
+#include <linux/clk.h>
+#include <linux/dev_printk.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io-pgtable.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/swab.h>
+#include <linux/types.h>
+
+#define DART_MAX_STREAMS 16
+#define DART_MAX_TTBR 4
+#define MAX_DARTS_PER_DEVICE 2
+
+#define DART_STREAM_ALL 0xffff
+
+#define DART_PARAMS1 0x00
+#define DART_PARAMS_PAGE_SHIFT GENMASK(27, 24)
+
+#define DART_PARAMS2 0x04
+#define DART_PARAMS_BYPASS_SUPPORT BIT(0)
+
+#define DART_STREAM_COMMAND 0x20
+#define DART_STREAM_COMMAND_BUSY BIT(2)
+#define DART_STREAM_COMMAND_INVALIDATE BIT(20)
+
+#define DART_STREAM_SELECT 0x34
+
+#define DART_ERROR 0x40
+#define DART_ERROR_STREAM GENMASK(27, 24)
+#define DART_ERROR_CODE GENMASK(11, 0)
+#define DART_ERROR_FLAG BIT(31)
+
+#define DART_ERROR_READ_FAULT BIT(4)
+#define DART_ERROR_WRITE_FAULT BIT(3)
+#define DART_ERROR_NO_PTE BIT(2)
+#define DART_ERROR_NO_PMD BIT(1)
+#define DART_ERROR_NO_TTBR BIT(0)
+
+#define DART_CONFIG 0x60
+#define DART_CONFIG_LOCK BIT(15)
+
+#define DART_STREAM_COMMAND_BUSY_TIMEOUT 100
+
+#define DART_ERROR_ADDR_HI 0x54
+#define DART_ERROR_ADDR_LO 0x50
+
+#define DART_TCR(sid) (0x100 + 4 * (sid))
+#define DART_TCR_TRANSLATE_ENABLE BIT(7)
+#define DART_TCR_BYPASS0_ENABLE BIT(8)
+#define DART_TCR_BYPASS1_ENABLE BIT(12)
+
+#define DART_TTBR(sid, idx) (0x200 + 16 * (sid) + 4 * (idx))
+#define DART_TTBR_VALID BIT(31)
+#define DART_TTBR_SHIFT 12
+
+/*
+ * Private structure associated with each DART device.
+ *
+ * @dev: device struct
+ * @regs: mapped MMIO region
+ * @irq: interrupt number, can be shared with other DARTs
+ * @clks: clocks associated with this DART
+ * @num_clks: number of @clks
+ * @lock: lock for hardware operations involving this dart
+ * @pgsize: pagesize supported by this DART
+ * @supports_bypass: indicates if this DART supports bypass mode
+ * @force_bypass: force bypass mode due to pagesize mismatch?
+ * @sid2group: maps stream ids to iommu_groups
+ * @iommu: iommu core device
+ */
+struct apple_dart {
+ struct device *dev;
+
+ void __iomem *regs;
+
+ int irq;
+ struct clk_bulk_data *clks;
+ int num_clks;
+
+ spinlock_t lock;
+
+ u32 pgsize;
+ u32 supports_bypass : 1;
+ u32 force_bypass : 1;
+
+ struct iommu_group *sid2group[DART_MAX_STREAMS];
+ struct iommu_device iommu;
+};
+
+/*
+ * Convenience struct to identify streams.
+ *
+ * The normal variant is used inside apple_dart_master_cfg which isn't written
+ * to concurrently.
+ * The atomic variant is used inside apple_dart_domain where we have to guard
+ * against races from potential parallel calls to attach/detach_device.
+ * Note that even inside the atomic variant the apple_dart pointer is not
+ * protected: This pointer is initialized once under the domain init mutex
+ * and never changed again afterwards. Devices with different dart pointers
+ * cannot be attached to the same domain.
+ *
+ * @dart dart pointer
+ * @sid stream id bitmap
+ */
+struct apple_dart_stream_map {
+ struct apple_dart *dart;
+ unsigned long sidmap;
+};
+struct apple_dart_atomic_stream_map {
+ struct apple_dart *dart;
+ atomic64_t sidmap;
+};
+
+/*
+ * This structure is attached to each iommu domain handled by a DART.
+ *
+ * @pgtbl_ops: pagetable ops allocated by io-pgtable
+ * @finalized: true if the domain has been completely initialized
+ * @init_lock: protects domain initialization
+ * @stream_maps: streams attached to this domain (valid for DMA/UNMANAGED only)
+ * @domain: core iommu domain pointer
+ */
+struct apple_dart_domain {
+ struct io_pgtable_ops *pgtbl_ops;
+
+ bool finalized;
+ struct mutex init_lock;
+ struct apple_dart_atomic_stream_map stream_maps[MAX_DARTS_PER_DEVICE];
+
+ struct iommu_domain domain;
+};
+
+/*
+ * This structure is attached to devices with dev_iommu_priv_set() on of_xlate
+ * and contains a list of streams bound to this device.
+ * So far the worst case seen is a single device with two streams
+ * from different darts, such that this simple static array is enough.
+ *
+ * @streams: streams for this device
+ */
+struct apple_dart_master_cfg {
+ struct apple_dart_stream_map stream_maps[MAX_DARTS_PER_DEVICE];
+};
+
+/*
+ * Helper macro to iterate over apple_dart_master_cfg.stream_maps and
+ * apple_dart_domain.stream_maps
+ *
+ * @i int used as loop variable
+ * @base pointer to base struct (apple_dart_master_cfg or apple_dart_domain)
+ * @stream pointer to the apple_dart_streams struct for each loop iteration
+ */
+#define for_each_stream_map(i, base, stream_map) \
+ for (i = 0, stream_map = &(base)->stream_maps[0]; \
+ i < MAX_DARTS_PER_DEVICE && stream_map->dart; \
+ stream_map = &(base)->stream_maps[++i])
+
+static struct platform_driver apple_dart_driver;
+static const struct iommu_ops apple_dart_iommu_ops;
+static const struct iommu_flush_ops apple_dart_tlb_ops;
+
+static struct apple_dart_domain *to_dart_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct apple_dart_domain, domain);
+}
+
+static void
+apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map)
+{
+ int sid;
+
+ for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+ writel(DART_TCR_TRANSLATE_ENABLE,
+ stream_map->dart->regs + DART_TCR(sid));
+}
+
+static void apple_dart_hw_disable_dma(struct apple_dart_stream_map *stream_map)
+{
+ int sid;
+
+ for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+ writel(0, stream_map->dart->regs + DART_TCR(sid));
+}
+
+static void
+apple_dart_hw_enable_bypass(struct apple_dart_stream_map *stream_map)
+{
+ int sid;
+
+ WARN_ON(!stream_map->dart->supports_bypass);
+ for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+ writel(DART_TCR_BYPASS0_ENABLE | DART_TCR_BYPASS1_ENABLE,
+ stream_map->dart->regs + DART_TCR(sid));
+}
+
+static void apple_dart_hw_set_ttbr(struct apple_dart_stream_map *stream_map,
+ u8 idx, phys_addr_t paddr)
+{
+ int sid;
+
+ WARN_ON(paddr & ((1 << DART_TTBR_SHIFT) - 1));
+ for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+ writel(DART_TTBR_VALID | (paddr >> DART_TTBR_SHIFT),
+ stream_map->dart->regs + DART_TTBR(sid, idx));
+}
+
+static void apple_dart_hw_clear_ttbr(struct apple_dart_stream_map *stream_map,
+ u8 idx)
+{
+ int sid;
+
+ for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+ writel(0, stream_map->dart->regs + DART_TTBR(sid, idx));
+}
+
+static void
+apple_dart_hw_clear_all_ttbrs(struct apple_dart_stream_map *stream_map)
+{
+ int i;
+
+ for (i = 0; i < DART_MAX_TTBR; ++i)
+ apple_dart_hw_clear_ttbr(stream_map, i);
+}
+
+static int
+apple_dart_hw_stream_command(struct apple_dart_stream_map *stream_map,
+ u32 command)
+{
+ unsigned long flags;
+ int ret;
+ u32 command_reg;
+
+ spin_lock_irqsave(&stream_map->dart->lock, flags);
+
+ writel(stream_map->sidmap, stream_map->dart->regs + DART_STREAM_SELECT);
+ writel(command, stream_map->dart->regs + DART_STREAM_COMMAND);
+
+ ret = readl_poll_timeout_atomic(
+ stream_map->dart->regs + DART_STREAM_COMMAND, command_reg,
+ !(command_reg & DART_STREAM_COMMAND_BUSY), 1,
+ DART_STREAM_COMMAND_BUSY_TIMEOUT);
+
+ spin_unlock_irqrestore(&stream_map->dart->lock, flags);
+
+ if (ret) {
+ dev_err(stream_map->dart->dev,
+ "busy bit did not clear after command %x for streams %lx\n",
+ command, stream_map->sidmap);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int
+apple_dart_hw_invalidate_tlb(struct apple_dart_stream_map *stream_map)
+{
+ return apple_dart_hw_stream_command(stream_map,
+ DART_STREAM_COMMAND_INVALIDATE);
+}
+
+static int apple_dart_hw_reset(struct apple_dart *dart)
+{
+ u32 config;
+ struct apple_dart_stream_map stream_map;
+
+ config = readl(dart->regs + DART_CONFIG);
+ if (config & DART_CONFIG_LOCK) {
+ dev_err(dart->dev, "DART is locked down until reboot: %08x\n",
+ config);
+ return -EINVAL;
+ }
+
+ stream_map.dart = dart;
+ stream_map.sidmap = DART_STREAM_ALL;
+ apple_dart_hw_disable_dma(&stream_map);
+ apple_dart_hw_clear_all_ttbrs(&stream_map);
+
+ /* clear any pending errors before the interrupt is unmasked */
+ writel(readl(dart->regs + DART_ERROR), dart->regs + DART_ERROR);
+
+ return apple_dart_hw_invalidate_tlb(&stream_map);
+}
+
+static void apple_dart_domain_flush_tlb(struct apple_dart_domain *domain)
+{
+ int i;
+ struct apple_dart_atomic_stream_map *domain_stream_map;
+ struct apple_dart_stream_map stream_map;
+
+ for_each_stream_map(i, domain, domain_stream_map) {
+ stream_map.dart = domain_stream_map->dart;
+ stream_map.sidmap = atomic64_read(&domain_stream_map->sidmap);
+ apple_dart_hw_invalidate_tlb(&stream_map);
+ }
+}
+
+static void apple_dart_flush_iotlb_all(struct iommu_domain *domain)
+{
+ apple_dart_domain_flush_tlb(to_dart_domain(domain));
+}
+
+static void apple_dart_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ apple_dart_domain_flush_tlb(to_dart_domain(domain));
+}
+
+static void apple_dart_iotlb_sync_map(struct iommu_domain *domain,
+ unsigned long iova, size_t size)
+{
+ apple_dart_domain_flush_tlb(to_dart_domain(domain));
+}
+
+static void apple_dart_tlb_flush_all(void *cookie)
+{
+ apple_dart_domain_flush_tlb(cookie);
+}
+
+static void apple_dart_tlb_flush_walk(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ apple_dart_domain_flush_tlb(cookie);
+}
+
+static const struct iommu_flush_ops apple_dart_tlb_ops = {
+ .tlb_flush_all = apple_dart_tlb_flush_all,
+ .tlb_flush_walk = apple_dart_tlb_flush_walk,
+};
+
+static phys_addr_t apple_dart_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct apple_dart_domain *dart_domain = to_dart_domain(domain);
+ struct io_pgtable_ops *ops = dart_domain->pgtbl_ops;
+
+ if (!ops)
+ return 0;
+
+ return ops->iova_to_phys(ops, iova);
+}
+
+static int apple_dart_map_pages(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize,
+ size_t pgcount, int prot, gfp_t gfp,
+ size_t *mapped)
+{
+ struct apple_dart_domain *dart_domain = to_dart_domain(domain);
+ struct io_pgtable_ops *ops = dart_domain->pgtbl_ops;
+
+ if (!ops)
+ return -ENODEV;
+
+ return ops->map_pages(ops, iova, paddr, pgsize, pgcount, prot, gfp,
+ mapped);
+}
+
+static size_t apple_dart_unmap_pages(struct iommu_domain *domain,
+ unsigned long iova, size_t pgsize,
+ size_t pgcount,
+ struct iommu_iotlb_gather *gather)
+{
+ struct apple_dart_domain *dart_domain = to_dart_domain(domain);
+ struct io_pgtable_ops *ops = dart_domain->pgtbl_ops;
+
+ return ops->unmap_pages(ops, iova, pgsize, pgcount, gather);
+}
+
+static void
+apple_dart_setup_translation(struct apple_dart_domain *domain,
+ struct apple_dart_stream_map *stream_map)
+{
+ int i;
+ struct io_pgtable_cfg *pgtbl_cfg =
+ &io_pgtable_ops_to_pgtable(domain->pgtbl_ops)->cfg;
+
+ for (i = 0; i < pgtbl_cfg->apple_dart_cfg.n_ttbrs; ++i)
+ apple_dart_hw_set_ttbr(stream_map, i,
+ pgtbl_cfg->apple_dart_cfg.ttbr[i]);
+ for (; i < DART_MAX_TTBR; ++i)
+ apple_dart_hw_clear_ttbr(stream_map, i);
+
+ apple_dart_hw_enable_translation(stream_map);
+ apple_dart_hw_invalidate_tlb(stream_map);
+}
+
+static int apple_dart_finalize_domain(struct iommu_domain *domain,
+ struct apple_dart_master_cfg *cfg)
+{
+ struct apple_dart_domain *dart_domain = to_dart_domain(domain);
+ struct apple_dart *dart = cfg->stream_maps[0].dart;
+ struct io_pgtable_cfg pgtbl_cfg;
+ int ret = 0;
+ int i;
+
+ mutex_lock(&dart_domain->init_lock);
+
+ if (dart_domain->finalized)
+ goto done;
+
+ for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
+ dart_domain->stream_maps[i].dart = cfg->stream_maps[i].dart;
+ atomic64_set(&dart_domain->stream_maps[i].sidmap,
+ cfg->stream_maps[i].sidmap);
+ }
+
+ pgtbl_cfg = (struct io_pgtable_cfg){
+ .pgsize_bitmap = dart->pgsize,
+ .ias = 32,
+ .oas = 36,
+ .coherent_walk = 1,
+ .tlb = &apple_dart_tlb_ops,
+ .iommu_dev = dart->dev,
+ };
+
+ dart_domain->pgtbl_ops =
+ alloc_io_pgtable_ops(APPLE_DART, &pgtbl_cfg, domain);
+ if (!dart_domain->pgtbl_ops) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
+ domain->geometry.aperture_start = 0;
+ domain->geometry.aperture_end = DMA_BIT_MASK(32);
+ domain->geometry.force_aperture = true;
+
+ dart_domain->finalized = true;
+
+done:
+ mutex_unlock(&dart_domain->init_lock);
+ return ret;
+}
+
+static int
+apple_dart_mod_streams(struct apple_dart_atomic_stream_map *domain_maps,
+ struct apple_dart_stream_map *master_maps,
+ bool add_streams)
+{
+ int i;
+
+ for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
+ if (domain_maps[i].dart != master_maps[i].dart)
+ return -EINVAL;
+ }
+
+ for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
+ if (!domain_maps[i].dart)
+ break;
+ if (add_streams)
+ atomic64_or(master_maps[i].sidmap,
+ &domain_maps[i].sidmap);
+ else
+ atomic64_and(~master_maps[i].sidmap,
+ &domain_maps[i].sidmap);
+ }
+
+ return 0;
+}
+
+static int apple_dart_domain_add_streams(struct apple_dart_domain *domain,
+ struct apple_dart_master_cfg *cfg)
+{
+ return apple_dart_mod_streams(domain->stream_maps, cfg->stream_maps,
+ true);
+}
+
+static int apple_dart_domain_remove_streams(struct apple_dart_domain *domain,
+ struct apple_dart_master_cfg *cfg)
+{
+ return apple_dart_mod_streams(domain->stream_maps, cfg->stream_maps,
+ false);
+}
+
+static int apple_dart_attach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ int ret, i;
+ struct apple_dart_stream_map *stream_map;
+ struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
+ struct apple_dart_domain *dart_domain = to_dart_domain(domain);
+
+ if (cfg->stream_maps[0].dart->force_bypass &&
+ domain->type != IOMMU_DOMAIN_IDENTITY)
+ return -EINVAL;
+ if (!cfg->stream_maps[0].dart->supports_bypass &&
+ domain->type == IOMMU_DOMAIN_IDENTITY)
+ return -EINVAL;
+
+ ret = apple_dart_finalize_domain(domain, cfg);
+ if (ret)
+ return ret;
+
+ switch (domain->type) {
+ case IOMMU_DOMAIN_DMA:
+ case IOMMU_DOMAIN_UNMANAGED:
+ ret = apple_dart_domain_add_streams(dart_domain, cfg);
+ if (ret)
+ return ret;
+
+ for_each_stream_map(i, cfg, stream_map)
+ apple_dart_setup_translation(dart_domain, stream_map);
+ break;
+ case IOMMU_DOMAIN_BLOCKED:
+ for_each_stream_map(i, cfg, stream_map)
+ apple_dart_hw_disable_dma(stream_map);
+ break;
+ case IOMMU_DOMAIN_IDENTITY:
+ for_each_stream_map(i, cfg, stream_map)
+ apple_dart_hw_enable_bypass(stream_map);
+ break;
+ }
+
+ return ret;
+}
+
+static void apple_dart_detach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ int i;
+ struct apple_dart_stream_map *stream_map;
+ struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
+ struct apple_dart_domain *dart_domain = to_dart_domain(domain);
+
+ for_each_stream_map(i, cfg, stream_map)
+ apple_dart_hw_disable_dma(stream_map);
+
+ if (domain->type == IOMMU_DOMAIN_DMA ||
+ domain->type == IOMMU_DOMAIN_UNMANAGED)
+ apple_dart_domain_remove_streams(dart_domain, cfg);
+}
+
+static struct iommu_device *apple_dart_probe_device(struct device *dev)
+{
+ struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
+ struct apple_dart_stream_map *stream_map;
+ int i;
+
+ if (!cfg)
+ return ERR_PTR(-ENODEV);
+
+ for_each_stream_map(i, cfg, stream_map)
+ device_link_add(
+ dev, stream_map->dart->dev,
+ DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_SUPPLIER);
+
+ return &cfg->stream_maps[0].dart->iommu;
+}
+
+static void apple_dart_release_device(struct device *dev)
+{
+ struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
+
+ if (!cfg)
+ return;
+
+ dev_iommu_priv_set(dev, NULL);
+ kfree(cfg);
+}
+
+static struct iommu_domain *apple_dart_domain_alloc(unsigned int type)
+{
+ struct apple_dart_domain *dart_domain;
+
+ if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED &&
+ type != IOMMU_DOMAIN_IDENTITY && type != IOMMU_DOMAIN_BLOCKED)
+ return NULL;
+
+ dart_domain = kzalloc(sizeof(*dart_domain), GFP_KERNEL);
+ if (!dart_domain)
+ return NULL;
+
+ iommu_get_dma_cookie(&dart_domain->domain);
+ mutex_init(&dart_domain->init_lock);
+
+ /* no need to allocate pgtbl_ops or do any other finalization steps */
+ if (type == IOMMU_DOMAIN_IDENTITY || type == IOMMU_DOMAIN_BLOCKED)
+ dart_domain->finalized = true;
+
+ return &dart_domain->domain;
+}
+
+static void apple_dart_domain_free(struct iommu_domain *domain)
+{
+ struct apple_dart_domain *dart_domain = to_dart_domain(domain);
+
+ if (dart_domain->pgtbl_ops)
+ free_io_pgtable_ops(dart_domain->pgtbl_ops);
+
+ kfree(dart_domain);
+}
+
+static int apple_dart_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+ struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
+ struct platform_device *iommu_pdev = of_find_device_by_node(args->np);
+ struct apple_dart *dart = platform_get_drvdata(iommu_pdev);
+ struct apple_dart *cfg_dart;
+ int i, sid;
+
+ if (args->args_count != 1)
+ return -EINVAL;
+ sid = args->args[0];
+
+ if (!cfg)
+ cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+ if (!cfg)
+ return -ENOMEM;
+ dev_iommu_priv_set(dev, cfg);
+
+ cfg_dart = cfg->stream_maps[0].dart;
+ if (cfg_dart) {
+ if (cfg_dart->supports_bypass != dart->supports_bypass)
+ return -EINVAL;
+ if (cfg_dart->force_bypass != dart->force_bypass)
+ return -EINVAL;
+ if (cfg_dart->pgsize != dart->pgsize)
+ return -EINVAL;
+ }
+
+ for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
+ if (cfg->stream_maps[i].dart == dart) {
+ cfg->stream_maps[i].sidmap |= 1 << sid;
+ return 0;
+ }
+ }
+ for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
+ if (!cfg->stream_maps[i].dart) {
+ cfg->stream_maps[i].dart = dart;
+ cfg->stream_maps[i].sidmap = 1 << sid;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static struct iommu_group *apple_dart_device_group(struct device *dev)
+{
+ static DEFINE_MUTEX(lock);
+ int i, sid;
+ struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
+ struct apple_dart_stream_map *stream_map;
+ struct iommu_group *group = NULL;
+ struct iommu_group *res = ERR_PTR(-EINVAL);
+
+ mutex_lock(&lock);
+
+ for_each_stream_map(i, cfg, stream_map) {
+ for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) {
+ struct iommu_group *stream_group =
+ stream_map->dart->sid2group[sid];
+
+ if (group && group != stream_group) {
+ res = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ group = stream_group;
+ }
+ }
+
+ if (group) {
+ res = iommu_group_ref_get(group);
+ goto out;
+ }
+
+#ifdef CONFIG_PCI
+ if (dev_is_pci(dev))
+ group = pci_device_group(dev);
+ else
+#endif
+ group = generic_device_group(dev);
+
+ for_each_stream_map(i, cfg, stream_map)
+ for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS)
+ stream_map->dart->sid2group[sid] = group;
+
+ res = group;
+
+out:
+ mutex_unlock(&lock);
+ return res;
+}
+
+static int apple_dart_def_domain_type(struct device *dev)
+{
+ struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
+
+ if (cfg->stream_maps[0].dart->force_bypass)
+ return IOMMU_DOMAIN_IDENTITY;
+ if (!cfg->stream_maps[0].dart->supports_bypass)
+ return IOMMU_DOMAIN_DMA;
+
+ return 0;
+}
+
+static const struct iommu_ops apple_dart_iommu_ops = {
+ .domain_alloc = apple_dart_domain_alloc,
+ .domain_free = apple_dart_domain_free,
+ .attach_dev = apple_dart_attach_dev,
+ .detach_dev = apple_dart_detach_dev,
+ .map_pages = apple_dart_map_pages,
+ .unmap_pages = apple_dart_unmap_pages,
+ .flush_iotlb_all = apple_dart_flush_iotlb_all,
+ .iotlb_sync = apple_dart_iotlb_sync,
+ .iotlb_sync_map = apple_dart_iotlb_sync_map,
+ .iova_to_phys = apple_dart_iova_to_phys,
+ .probe_device = apple_dart_probe_device,
+ .release_device = apple_dart_release_device,
+ .device_group = apple_dart_device_group,
+ .of_xlate = apple_dart_of_xlate,
+ .def_domain_type = apple_dart_def_domain_type,
+ .pgsize_bitmap = -1UL, /* Restricted during dart probe */
+};
+
+static irqreturn_t apple_dart_irq(int irq, void *dev)
+{
+ struct apple_dart *dart = dev;
+ const char *fault_name = NULL;
+ u32 error = readl(dart->regs + DART_ERROR);
+ u32 error_code = FIELD_GET(DART_ERROR_CODE, error);
+ u32 addr_lo = readl(dart->regs + DART_ERROR_ADDR_LO);
+ u32 addr_hi = readl(dart->regs + DART_ERROR_ADDR_HI);
+ u64 addr = addr_lo | (((u64)addr_hi) << 32);
+ u8 stream_idx = FIELD_GET(DART_ERROR_STREAM, error);
+
+ if (!(error & DART_ERROR_FLAG))
+ return IRQ_NONE;
+
+ /* there should only be a single bit set but let's use == to be sure */
+ if (error_code == DART_ERROR_READ_FAULT)
+ fault_name = "READ FAULT";
+ else if (error_code == DART_ERROR_WRITE_FAULT)
+ fault_name = "WRITE FAULT";
+ else if (error_code == DART_ERROR_NO_PTE)
+ fault_name = "NO PTE FOR IOVA";
+ else if (error_code == DART_ERROR_NO_PMD)
+ fault_name = "NO PMD FOR IOVA";
+ else if (error_code == DART_ERROR_NO_TTBR)
+ fault_name = "NO TTBR FOR IOVA";
+ else
+ fault_name = "unknown";
+
+ dev_err_ratelimited(
+ dart->dev,
+ "translation fault: status:0x%x stream:%d code:0x%x (%s) at 0x%llx",
+ error, stream_idx, error_code, fault_name, addr);
+
+ writel(error, dart->regs + DART_ERROR);
+ return IRQ_HANDLED;
+}
+
+static int apple_dart_set_bus_ops(const struct iommu_ops *ops)
+{
+ int ret;
+
+ if (!iommu_present(&platform_bus_type)) {
+ ret = bus_set_iommu(&platform_bus_type, ops);
+ if (ret)
+ return ret;
+ }
+#ifdef CONFIG_PCI
+ if (!iommu_present(&pci_bus_type)) {
+ ret = bus_set_iommu(&pci_bus_type, ops);
+ if (ret) {
+ bus_set_iommu(&platform_bus_type, NULL);
+ return ret;
+ }
+ }
+#endif
+ return 0;
+}
+
+static int apple_dart_probe(struct platform_device *pdev)
+{
+ int ret;
+ u32 dart_params[2];
+ struct resource *res;
+ struct apple_dart *dart;
+ struct device *dev = &pdev->dev;
+
+ dart = devm_kzalloc(dev, sizeof(*dart), GFP_KERNEL);
+ if (!dart)
+ return -ENOMEM;
+
+ dart->dev = dev;
+ spin_lock_init(&dart->lock);
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (resource_size(res) < 0x4000) {
+ dev_err(dev, "MMIO region too small (%pr)\n", res);
+ return -EINVAL;
+ }
+
+ dart->regs = devm_ioremap_resource(dev, res);
+ if (IS_ERR(dart->regs))
+ return PTR_ERR(dart->regs);
+
+ dart->irq = platform_get_irq(pdev, 0);
+ if (dart->irq < 0)
+ return -ENODEV;
+
+ ret = devm_clk_bulk_get_all(dev, &dart->clks);
+ if (ret < 0)
+ return ret;
+ dart->num_clks = ret;
+
+ ret = clk_bulk_prepare_enable(dart->num_clks, dart->clks);
+ if (ret)
+ return ret;
+
+ ret = apple_dart_hw_reset(dart);
+ if (ret)
+ goto err_clk_disable;
+
+ dart_params[0] = readl(dart->regs + DART_PARAMS1);
+ dart_params[1] = readl(dart->regs + DART_PARAMS2);
+ dart->pgsize = 1 << FIELD_GET(DART_PARAMS_PAGE_SHIFT, dart_params[0]);
+ dart->supports_bypass = dart_params[1] & DART_PARAMS_BYPASS_SUPPORT;
+ dart->force_bypass = dart->pgsize > PAGE_SIZE;
+
+ ret = request_irq(dart->irq, apple_dart_irq, IRQF_SHARED,
+ "apple-dart fault handler", dart);
+ if (ret)
+ goto err_clk_disable;
+
+ platform_set_drvdata(pdev, dart);
+
+ ret = apple_dart_set_bus_ops(&apple_dart_iommu_ops);
+ if (ret)
+ goto err_free_irq;
+
+ ret = iommu_device_sysfs_add(&dart->iommu, dev, NULL, "apple-dart.%s",
+ dev_name(&pdev->dev));
+ if (ret)
+ goto err_remove_bus_ops;
+
+ ret = iommu_device_register(&dart->iommu, &apple_dart_iommu_ops, dev);
+ if (ret)
+ goto err_sysfs_remove;
+
+ dev_info(
+ &pdev->dev,
+ "DART [pagesize %x, bypass support: %d, bypass forced: %d] initialized\n",
+ dart->pgsize, dart->supports_bypass, dart->force_bypass);
+ return 0;
+
+err_sysfs_remove:
+ iommu_device_sysfs_remove(&dart->iommu);
+err_remove_bus_ops:
+ apple_dart_set_bus_ops(NULL);
+err_free_irq:
+ free_irq(dart->irq, dart);
+err_clk_disable:
+ clk_bulk_disable_unprepare(dart->num_clks, dart->clks);
+
+ return ret;
+}
+
+static int apple_dart_remove(struct platform_device *pdev)
+{
+ struct apple_dart *dart = platform_get_drvdata(pdev);
+
+ apple_dart_hw_reset(dart);
+ free_irq(dart->irq, dart);
+ apple_dart_set_bus_ops(NULL);
+
+ iommu_device_unregister(&dart->iommu);
+ iommu_device_sysfs_remove(&dart->iommu);
+
+ clk_bulk_disable_unprepare(dart->num_clks, dart->clks);
+
+ return 0;
+}
+
+static const struct of_device_id apple_dart_of_match[] = {
+ { .compatible = "apple,t8103-dart", .data = NULL },
+ {},
+};
+MODULE_DEVICE_TABLE(of, apple_dart_of_match);
+
+static struct platform_driver apple_dart_driver = {
+ .driver = {
+ .name = "apple-dart",
+ .of_match_table = apple_dart_of_match,
+ .suppress_bind_attrs = true,
+ },
+ .probe = apple_dart_probe,
+ .remove = apple_dart_remove,
+};
+
+module_platform_driver(apple_dart_driver);
+
+MODULE_DESCRIPTION("IOMMU API for Apple's DART");
+MODULE_AUTHOR("Sven Peter <sven@svenpeter.dev>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 235f9bdaeaf2..a388e318f86e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -335,10 +335,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
return 0;
}
+static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu)
+{
+ return &smmu->cmdq;
+}
+
static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
- u32 prod)
+ struct arm_smmu_queue *q, u32 prod)
{
- struct arm_smmu_queue *q = &smmu->cmdq.q;
struct arm_smmu_cmdq_ent ent = {
.opcode = CMDQ_OP_CMD_SYNC,
};
@@ -355,7 +359,8 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
arm_smmu_cmdq_build_cmd(cmd, &ent);
}
-static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
+static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
+ struct arm_smmu_queue *q)
{
static const char * const cerror_str[] = {
[CMDQ_ERR_CERROR_NONE_IDX] = "No error",
@@ -366,7 +371,6 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
int i;
u64 cmd[CMDQ_ENT_DWORDS];
- struct arm_smmu_queue *q = &smmu->cmdq.q;
u32 cons = readl_relaxed(q->cons_reg);
u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons);
struct arm_smmu_cmdq_ent cmd_sync = {
@@ -413,6 +417,11 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
}
+static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
+{
+ __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq.q);
+}
+
/*
* Command queue locking.
* This is a form of bastardised rwlock with the following major changes:
@@ -579,7 +588,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
{
unsigned long flags;
struct arm_smmu_queue_poll qp;
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
int ret = 0;
/*
@@ -595,7 +604,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
queue_poll_init(smmu, &qp);
do {
- llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+ llq->val = READ_ONCE(cmdq->q.llq.val);
if (!queue_full(llq))
break;
@@ -614,7 +623,7 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
{
int ret = 0;
struct arm_smmu_queue_poll qp;
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
queue_poll_init(smmu, &qp);
@@ -637,12 +646,12 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
struct arm_smmu_ll_queue *llq)
{
struct arm_smmu_queue_poll qp;
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
u32 prod = llq->prod;
int ret = 0;
queue_poll_init(smmu, &qp);
- llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+ llq->val = READ_ONCE(cmdq->q.llq.val);
do {
if (queue_consumed(llq, prod))
break;
@@ -732,12 +741,12 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
u32 prod;
unsigned long flags;
bool owner;
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
- struct arm_smmu_ll_queue llq = {
- .max_n_shift = cmdq->q.llq.max_n_shift,
- }, head = llq;
+ struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
+ struct arm_smmu_ll_queue llq, head;
int ret = 0;
+ llq.max_n_shift = cmdq->q.llq.max_n_shift;
+
/* 1. Allocate some space in the queue */
local_irq_save(flags);
llq.val = READ_ONCE(cmdq->q.llq.val);
@@ -772,7 +781,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
if (sync) {
prod = queue_inc_prod_n(&llq, n);
- arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+ arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, &cmdq->q, prod);
queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
/*
@@ -845,8 +854,9 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
return ret;
}
-static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq_ent *ent)
+static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_ent *ent,
+ bool sync)
{
u64 cmd[CMDQ_ENT_DWORDS];
@@ -856,12 +866,19 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
return -EINVAL;
}
- return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
+ return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync);
+}
+
+static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_ent *ent)
+{
+ return __arm_smmu_cmdq_issue_cmd(smmu, ent, false);
}
-static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_ent *ent)
{
- return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
+ return __arm_smmu_cmdq_issue_cmd(smmu, ent, true);
}
static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
@@ -929,8 +946,7 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
.tlbi.asid = asid,
};
- arm_smmu_cmdq_issue_cmd(smmu, &cmd);
- arm_smmu_cmdq_issue_sync(smmu);
+ arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
}
static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
@@ -939,7 +955,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
size_t i;
unsigned long flags;
struct arm_smmu_master *master;
- struct arm_smmu_cmdq_batch cmds = {};
+ struct arm_smmu_cmdq_batch cmds;
struct arm_smmu_device *smmu = smmu_domain->smmu;
struct arm_smmu_cmdq_ent cmd = {
.opcode = CMDQ_OP_CFGI_CD,
@@ -949,6 +965,8 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
},
};
+ cmds.num = 0;
+
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
list_for_each_entry(master, &smmu_domain->devices, domain_head) {
for (i = 0; i < master->num_streams; i++) {
@@ -1211,8 +1229,7 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
},
};
- arm_smmu_cmdq_issue_cmd(smmu, &cmd);
- arm_smmu_cmdq_issue_sync(smmu);
+ arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
}
static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
@@ -1747,15 +1764,16 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
{
int i;
struct arm_smmu_cmdq_ent cmd;
+ struct arm_smmu_cmdq_batch cmds = {};
arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
for (i = 0; i < master->num_streams; i++) {
cmd.atc.sid = master->streams[i].id;
- arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
+ arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
}
- return arm_smmu_cmdq_issue_sync(master->smmu);
+ return arm_smmu_cmdq_batch_submit(master->smmu, &cmds);
}
int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
@@ -1765,7 +1783,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
unsigned long flags;
struct arm_smmu_cmdq_ent cmd;
struct arm_smmu_master *master;
- struct arm_smmu_cmdq_batch cmds = {};
+ struct arm_smmu_cmdq_batch cmds;
if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
return 0;
@@ -1789,6 +1807,8 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
+ cmds.num = 0;
+
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
list_for_each_entry(master, &smmu_domain->devices, domain_head) {
if (!master->ats_enabled)
@@ -1823,8 +1843,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
} else {
cmd.opcode = CMDQ_OP_TLBI_S12_VMALL;
cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
- arm_smmu_cmdq_issue_cmd(smmu, &cmd);
- arm_smmu_cmdq_issue_sync(smmu);
+ arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
}
arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
}
@@ -1837,7 +1856,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
struct arm_smmu_device *smmu = smmu_domain->smmu;
unsigned long end = iova + size, num_pages = 0, tg = 0;
size_t inv_range = granule;
- struct arm_smmu_cmdq_batch cmds = {};
+ struct arm_smmu_cmdq_batch cmds;
if (!size)
return;
@@ -1855,6 +1874,8 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
num_pages = size >> tg;
}
+ cmds.num = 0;
+
while (iova < end) {
if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
/*
@@ -1972,6 +1993,7 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
if (type != IOMMU_DOMAIN_UNMANAGED &&
type != IOMMU_DOMAIN_DMA &&
+ type != IOMMU_DOMAIN_DMA_FQ &&
type != IOMMU_DOMAIN_IDENTITY)
return NULL;
@@ -1984,12 +2006,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
if (!smmu_domain)
return NULL;
- if (type == IOMMU_DOMAIN_DMA &&
- iommu_get_dma_cookie(&smmu_domain->domain)) {
- kfree(smmu_domain);
- return NULL;
- }
-
mutex_init(&smmu_domain->init_mutex);
INIT_LIST_HEAD(&smmu_domain->devices);
spin_lock_init(&smmu_domain->devices_lock);
@@ -2021,7 +2037,6 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_device *smmu = smmu_domain->smmu;
- iommu_put_dma_cookie(domain);
free_io_pgtable_ops(smmu_domain->pgtbl_ops);
/* Free the CD and ASID, if we allocated them */
@@ -2181,9 +2196,6 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain,
.iommu_dev = smmu->dev,
};
- if (!iommu_get_dma_strict(domain))
- pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
-
pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops)
return -ENOMEM;
@@ -2439,19 +2451,21 @@ out_unlock:
return ret;
}
-static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped)
{
struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
if (!ops)
return -ENODEV;
- return ops->map(ops, iova, paddr, size, prot, gfp);
+ return ops->map_pages(ops, iova, paddr, pgsize, pgcount, prot, gfp, mapped);
}
-static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
- size_t size, struct iommu_iotlb_gather *gather)
+static size_t arm_smmu_unmap_pages(struct iommu_domain *domain, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *gather)
{
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
@@ -2459,7 +2473,7 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
if (!ops)
return 0;
- return ops->unmap(ops, iova, size, gather);
+ return ops->unmap_pages(ops, iova, pgsize, pgcount, gather);
}
static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
@@ -2488,9 +2502,6 @@ arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
{
struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
- if (domain->type == IOMMU_DOMAIN_IDENTITY)
- return iova;
-
if (!ops)
return 0;
@@ -2825,8 +2836,8 @@ static struct iommu_ops arm_smmu_ops = {
.domain_alloc = arm_smmu_domain_alloc,
.domain_free = arm_smmu_domain_free,
.attach_dev = arm_smmu_attach_dev,
- .map = arm_smmu_map,
- .unmap = arm_smmu_unmap,
+ .map_pages = arm_smmu_map_pages,
+ .unmap_pages = arm_smmu_unmap_pages,
.flush_iotlb_all = arm_smmu_flush_iotlb_all,
.iotlb_sync = arm_smmu_iotlb_sync,
.iova_to_phys = arm_smmu_iova_to_phys,
@@ -3338,18 +3349,16 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
/* Invalidate any cached configuration */
cmd.opcode = CMDQ_OP_CFGI_ALL;
- arm_smmu_cmdq_issue_cmd(smmu, &cmd);
- arm_smmu_cmdq_issue_sync(smmu);
+ arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
/* Invalidate any stale TLB entries */
if (smmu->features & ARM_SMMU_FEAT_HYP) {
cmd.opcode = CMDQ_OP_TLBI_EL2_ALL;
- arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
}
cmd.opcode = CMDQ_OP_TLBI_NSNH_ALL;
- arm_smmu_cmdq_issue_cmd(smmu, &cmd);
- arm_smmu_cmdq_issue_sync(smmu);
+ arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
/* Event queue */
writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE);
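
One theme of the hunks above is replacing per-command issue+sync pairs with arm_smmu_cmdq_issue_cmd_with_sync(), or with a batch that is filled via arm_smmu_cmdq_batch_add() and flushed once by arm_smmu_cmdq_batch_submit(), as in arm_smmu_atc_inv_master(). Below is a rough standalone model of that batching idea only; the names and the fixed-size buffer are made up and do not reflect the real hardware queue mechanics:

/*
 * Rough user-space model of command batching: accumulate commands and pay
 * for a single sync when the batch is submitted.
 */
#include <stdio.h>

#define BATCH_MAX 8

struct cmd_batch {
	int cmds[BATCH_MAX];
	int num;
};

static void batch_add(struct cmd_batch *b, int cmd)
{
	if (b->num == BATCH_MAX) {
		printf("submit %d commands (no sync)\n", b->num);
		b->num = 0;
	}
	b->cmds[b->num++] = cmd;
}

static void batch_submit(struct cmd_batch *b)
{
	printf("submit %d commands + sync\n", b->num);
	b->num = 0;
}

int main(void)
{
	struct cmd_batch b = { .num = 0 };
	int sid;

	for (sid = 0; sid < 3; sid++)
		batch_add(&b, sid);	/* one invalidation per stream ID */
	batch_submit(&b);		/* a single sync covers the batch */
	return 0;
}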
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 9b9d13ec5a88..55690af1b25d 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -193,6 +193,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
{
struct adreno_smmu_priv *priv;
+ smmu_domain->cfg.flush_walk_prefer_tlbiasid = true;
+
/* Only enable split pagetables for the GPU device (SID 0) */
if (!qcom_adreno_smmu_is_gpu_device(dev))
return 0;
@@ -235,6 +237,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = {
{ }
};
+static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain,
+ struct io_pgtable_cfg *pgtbl_cfg, struct device *dev)
+{
+ smmu_domain->cfg.flush_walk_prefer_tlbiasid = true;
+
+ return 0;
+}
+
static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
{
unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
@@ -358,6 +368,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu)
}
static const struct arm_smmu_impl qcom_smmu_impl = {
+ .init_context = qcom_smmu_init_context,
.cfg_probe = qcom_smmu_cfg_probe,
.def_domain_type = qcom_smmu_def_domain_type,
.reset = qcom_smmu500_reset,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index f22dbeb1e510..4bc75c4ce402 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -327,9 +327,16 @@ static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size,
static void arm_smmu_tlb_inv_walk_s1(unsigned long iova, size_t size,
size_t granule, void *cookie)
{
- arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie,
- ARM_SMMU_CB_S1_TLBIVA);
- arm_smmu_tlb_sync_context(cookie);
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+
+ if (cfg->flush_walk_prefer_tlbiasid) {
+ arm_smmu_tlb_inv_context_s1(cookie);
+ } else {
+ arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie,
+ ARM_SMMU_CB_S1_TLBIVA);
+ arm_smmu_tlb_sync_context(cookie);
+ }
}
static void arm_smmu_tlb_add_page_s1(struct iommu_iotlb_gather *gather,
@@ -765,9 +772,6 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
.iommu_dev = smmu->dev,
};
- if (!iommu_get_dma_strict(domain))
- pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
-
if (smmu->impl && smmu->impl->init_context) {
ret = smmu->impl->init_context(smmu_domain, &pgtbl_cfg, dev);
if (ret)
@@ -868,10 +872,11 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
{
struct arm_smmu_domain *smmu_domain;
- if (type != IOMMU_DOMAIN_UNMANAGED &&
- type != IOMMU_DOMAIN_DMA &&
- type != IOMMU_DOMAIN_IDENTITY)
- return NULL;
+ if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_IDENTITY) {
+ if (using_legacy_binding ||
+ (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_DMA_FQ))
+ return NULL;
+ }
/*
* Allocate the domain and initialise some of its data structures.
* We can't really do anything meaningful until we've added a
@@ -881,12 +886,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
if (!smmu_domain)
return NULL;
- if (type == IOMMU_DOMAIN_DMA && (using_legacy_binding ||
- iommu_get_dma_cookie(&smmu_domain->domain))) {
- kfree(smmu_domain);
- return NULL;
- }
-
mutex_init(&smmu_domain->init_mutex);
spin_lock_init(&smmu_domain->cb_lock);
@@ -901,7 +900,6 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
* Free the domain resources. We assume that all devices have
* already been detached.
*/
- iommu_put_dma_cookie(domain);
arm_smmu_destroy_domain_context(domain);
kfree(smmu_domain);
}
@@ -1198,8 +1196,9 @@ rpm_put:
return ret;
}
-static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped)
{
struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
@@ -1209,14 +1208,15 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
return -ENODEV;
arm_smmu_rpm_get(smmu);
- ret = ops->map(ops, iova, paddr, size, prot, gfp);
+ ret = ops->map_pages(ops, iova, paddr, pgsize, pgcount, prot, gfp, mapped);
arm_smmu_rpm_put(smmu);
return ret;
}
-static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
- size_t size, struct iommu_iotlb_gather *gather)
+static size_t arm_smmu_unmap_pages(struct iommu_domain *domain, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *iotlb_gather)
{
struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
@@ -1226,7 +1226,7 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
return 0;
arm_smmu_rpm_get(smmu);
- ret = ops->unmap(ops, iova, size, gather);
+ ret = ops->unmap_pages(ops, iova, pgsize, pgcount, iotlb_gather);
arm_smmu_rpm_put(smmu);
return ret;
@@ -1320,9 +1320,6 @@ static phys_addr_t arm_smmu_iova_to_phys(struct iommu_domain *domain,
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
- if (domain->type == IOMMU_DOMAIN_IDENTITY)
- return iova;
-
if (!ops)
return 0;
@@ -1478,16 +1475,21 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
struct iommu_group *group = NULL;
int i, idx;
+ mutex_lock(&smmu->stream_map_mutex);
for_each_cfg_sme(cfg, fwspec, i, idx) {
if (group && smmu->s2crs[idx].group &&
- group != smmu->s2crs[idx].group)
+ group != smmu->s2crs[idx].group) {
+ mutex_unlock(&smmu->stream_map_mutex);
return ERR_PTR(-EINVAL);
+ }
group = smmu->s2crs[idx].group;
}
- if (group)
+ if (group) {
+ mutex_unlock(&smmu->stream_map_mutex);
return iommu_group_ref_get(group);
+ }
if (dev_is_pci(dev))
group = pci_device_group(dev);
@@ -1501,6 +1503,7 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
for_each_cfg_sme(cfg, fwspec, i, idx)
smmu->s2crs[idx].group = group;
+ mutex_unlock(&smmu->stream_map_mutex);
return group;
}
@@ -1582,8 +1585,8 @@ static struct iommu_ops arm_smmu_ops = {
.domain_alloc = arm_smmu_domain_alloc,
.domain_free = arm_smmu_domain_free,
.attach_dev = arm_smmu_attach_dev,
- .map = arm_smmu_map,
- .unmap = arm_smmu_unmap,
+ .map_pages = arm_smmu_map_pages,
+ .unmap_pages = arm_smmu_unmap_pages,
.flush_iotlb_all = arm_smmu_flush_iotlb_all,
.iotlb_sync = arm_smmu_iotlb_sync,
.iova_to_phys = arm_smmu_iova_to_phys,
@@ -2281,18 +2284,38 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev)
static int __maybe_unused arm_smmu_pm_resume(struct device *dev)
{
+ int ret;
+ struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+
+ ret = clk_bulk_prepare(smmu->num_clks, smmu->clks);
+ if (ret)
+ return ret;
+
if (pm_runtime_suspended(dev))
return 0;
- return arm_smmu_runtime_resume(dev);
+ ret = arm_smmu_runtime_resume(dev);
+ if (ret)
+ clk_bulk_unprepare(smmu->num_clks, smmu->clks);
+
+ return ret;
}
static int __maybe_unused arm_smmu_pm_suspend(struct device *dev)
{
+ int ret = 0;
+ struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+
if (pm_runtime_suspended(dev))
- return 0;
+ goto clk_unprepare;
+
+ ret = arm_smmu_runtime_suspend(dev);
+ if (ret)
+ return ret;
- return arm_smmu_runtime_suspend(dev);
+clk_unprepare:
+ clk_bulk_unprepare(smmu->num_clks, smmu->clks);
+ return ret;
}
static const struct dev_pm_ops arm_smmu_pm_ops = {
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index a50271595960..432de2f742c3 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -346,6 +346,7 @@ struct arm_smmu_cfg {
};
enum arm_smmu_cbar_type cbar;
enum arm_smmu_context_fmt fmt;
+ bool flush_walk_prefer_tlbiasid;
};
#define ARM_SMMU_INVALID_IRPTNDX 0xff
diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
index 021cf8f65ffc..b91874cb6cf3 100644
--- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -10,7 +10,6 @@
#include <linux/bitfield.h>
#include <linux/clk.h>
#include <linux/delay.h>
-#include <linux/dma-iommu.h>
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/interrupt.h>
@@ -335,12 +334,6 @@ static struct iommu_domain *qcom_iommu_domain_alloc(unsigned type)
if (!qcom_domain)
return NULL;
- if (type == IOMMU_DOMAIN_DMA &&
- iommu_get_dma_cookie(&qcom_domain->domain)) {
- kfree(qcom_domain);
- return NULL;
- }
-
mutex_init(&qcom_domain->init_mutex);
spin_lock_init(&qcom_domain->pgtbl_lock);
@@ -351,8 +344,6 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain)
{
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
- iommu_put_dma_cookie(domain);
-
if (qcom_domain->iommu) {
/*
* NOTE: unmap can be called after client device is powered
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 9b4c025b8871..896bea04c347 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -317,6 +317,30 @@ static bool dev_is_untrusted(struct device *dev)
return dev_is_pci(dev) && to_pci_dev(dev)->untrusted;
}
+/* sysfs updates are serialised by the mutex of the group owning @domain */
+int iommu_dma_init_fq(struct iommu_domain *domain)
+{
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ int ret;
+
+ if (cookie->fq_domain)
+ return 0;
+
+ ret = init_iova_flush_queue(&cookie->iovad, iommu_dma_flush_iotlb_all,
+ iommu_dma_entry_dtor);
+ if (ret) {
+ pr_warn("iova flush queue initialization failed\n");
+ return ret;
+ }
+ /*
+ * Prevent an incomplete iovad->fq from being observable. Pairs with the path from
+ * __iommu_dma_unmap() through iommu_dma_free_iova() to queue_iova()
+ */
+ smp_wmb();
+ WRITE_ONCE(cookie->fq_domain, domain);
+ return 0;
+}
+
/**
* iommu_dma_init_domain - Initialise a DMA mapping domain
* @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
@@ -370,17 +394,9 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
init_iova_domain(iovad, 1UL << order, base_pfn);
- if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) &&
- domain->ops->flush_iotlb_all && !iommu_get_dma_strict(domain)) {
- if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all,
- iommu_dma_entry_dtor))
- pr_warn("iova flush queue initialization failed\n");
- else
- cookie->fq_domain = domain;
- }
-
- if (!dev)
- return 0;
+ /* If the FQ fails we can simply fall back to strict mode */
+ if (domain->type == IOMMU_DOMAIN_DMA_FQ && iommu_dma_init_fq(domain))
+ domain->type = IOMMU_DOMAIN_DMA;
return iova_reserve_iommu_regions(dev, domain);
}
@@ -455,17 +471,17 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
}
static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
- dma_addr_t iova, size_t size, struct page *freelist)
+ dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather)
{
struct iova_domain *iovad = &cookie->iovad;
/* The MSI case is only ever cleaning up its most recent allocation */
if (cookie->type == IOMMU_DMA_MSI_COOKIE)
cookie->msi_iova -= size;
- else if (cookie->fq_domain) /* non-strict mode */
+ else if (gather && gather->queued)
queue_iova(iovad, iova_pfn(iovad, iova),
size >> iova_shift(iovad),
- (unsigned long)freelist);
+ (unsigned long)gather->freelist);
else
free_iova_fast(iovad, iova_pfn(iovad, iova),
size >> iova_shift(iovad));
@@ -484,13 +500,14 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
dma_addr -= iova_off;
size = iova_align(iovad, size + iova_off);
iommu_iotlb_gather_init(&iotlb_gather);
+ iotlb_gather.queued = READ_ONCE(cookie->fq_domain);
unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather);
WARN_ON(unmapped != size);
- if (!cookie->fq_domain)
+ if (!iotlb_gather.queued)
iommu_iotlb_sync(domain, &iotlb_gather);
- iommu_dma_free_iova(cookie, dma_addr, size, iotlb_gather.freelist);
+ iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather);
}
static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
@@ -506,7 +523,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
__iommu_dma_unmap(dev, dma_addr, size);
- if (unlikely(is_swiotlb_buffer(phys)))
+ if (unlikely(is_swiotlb_buffer(dev, phys)))
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}
@@ -577,7 +594,7 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
}
iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask);
- if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys))
+ if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys))
swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs);
return iova;
}
@@ -784,7 +801,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(phys, size, dir);
- if (is_swiotlb_buffer(phys))
+ if (is_swiotlb_buffer(dev, phys))
swiotlb_sync_single_for_cpu(dev, phys, size, dir);
}
@@ -797,7 +814,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
return;
phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
- if (is_swiotlb_buffer(phys))
+ if (is_swiotlb_buffer(dev, phys))
swiotlb_sync_single_for_device(dev, phys, size, dir);
if (!dev_is_dma_coherent(dev))
@@ -818,7 +835,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
- if (is_swiotlb_buffer(sg_phys(sg)))
+ if (is_swiotlb_buffer(dev, sg_phys(sg)))
swiotlb_sync_single_for_cpu(dev, sg_phys(sg),
sg->length, dir);
}
@@ -835,7 +852,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
return;
for_each_sg(sgl, sg, nelems, i) {
- if (is_swiotlb_buffer(sg_phys(sg)))
+ if (is_swiotlb_buffer(dev, sg_phys(sg)))
swiotlb_sync_single_for_device(dev, sg_phys(sg),
sg->length, dir);
@@ -1330,7 +1347,7 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
* The IOMMU core code allocates the default DMA domain, which the
* underlying IOMMU driver needs to support via the dma-iommu layer.
*/
- if (domain->type == IOMMU_DOMAIN_DMA) {
+ if (iommu_is_dma_domain(domain)) {
if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
goto out_err;
dev->dma_ops = &iommu_dma_ops;
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index d0fbf1d10e18..939ffa768986 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -21,7 +21,6 @@
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
-#include <linux/dma-iommu.h>
typedef u32 sysmmu_iova_t;
typedef u32 sysmmu_pte_t;
@@ -735,20 +734,16 @@ static struct iommu_domain *exynos_iommu_domain_alloc(unsigned type)
/* Check if correct PTE offsets are initialized */
BUG_ON(PG_ENT_SHIFT < 0 || !dma_dev);
+ if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED)
+ return NULL;
+
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
return NULL;
- if (type == IOMMU_DOMAIN_DMA) {
- if (iommu_get_dma_cookie(&domain->domain) != 0)
- goto err_pgtable;
- } else if (type != IOMMU_DOMAIN_UNMANAGED) {
- goto err_pgtable;
- }
-
domain->pgtable = (sysmmu_pte_t *)__get_free_pages(GFP_KERNEL, 2);
if (!domain->pgtable)
- goto err_dma_cookie;
+ goto err_pgtable;
domain->lv2entcnt = (short *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
if (!domain->lv2entcnt)
@@ -779,9 +774,6 @@ err_lv2ent:
free_pages((unsigned long)domain->lv2entcnt, 1);
err_counter:
free_pages((unsigned long)domain->pgtable, 2);
-err_dma_cookie:
- if (type == IOMMU_DOMAIN_DMA)
- iommu_put_dma_cookie(&domain->domain);
err_pgtable:
kfree(domain);
return NULL;
@@ -809,9 +801,6 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain)
spin_unlock_irqrestore(&domain->lock, flags);
- if (iommu_domain->type == IOMMU_DOMAIN_DMA)
- iommu_put_dma_cookie(iommu_domain);
-
dma_unmap_single(dma_dev, virt_to_phys(domain->pgtable), LV1TABLE_SIZE,
DMA_TO_DEVICE);
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 43ebd8af11c5..0ddb77115be7 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -25,9 +25,11 @@ config INTEL_IOMMU
and include PCI device scope covered by these DMA
remapping devices.
+if INTEL_IOMMU
+
config INTEL_IOMMU_DEBUGFS
bool "Export Intel IOMMU internals in Debugfs"
- depends on INTEL_IOMMU && IOMMU_DEBUGFS
+ depends on IOMMU_DEBUGFS
select DMAR_PERF
help
!!!WARNING!!!
@@ -41,7 +43,7 @@ config INTEL_IOMMU_DEBUGFS
config INTEL_IOMMU_SVM
bool "Support for Shared Virtual Memory with Intel IOMMU"
- depends on INTEL_IOMMU && X86_64
+ depends on X86_64
select PCI_PASID
select PCI_PRI
select MMU_NOTIFIER
@@ -53,9 +55,8 @@ config INTEL_IOMMU_SVM
means of a Process Address Space ID (PASID).
config INTEL_IOMMU_DEFAULT_ON
- def_bool y
- prompt "Enable Intel DMA Remapping Devices by default"
- depends on INTEL_IOMMU
+ bool "Enable Intel DMA Remapping Devices by default"
+ default y
help
Selecting this option will enable a DMAR device at boot time if
one is found. If this option is not selected, DMAR support can
@@ -63,7 +64,7 @@ config INTEL_IOMMU_DEFAULT_ON
config INTEL_IOMMU_BROKEN_GFX_WA
bool "Workaround broken graphics drivers (going away soon)"
- depends on INTEL_IOMMU && BROKEN && X86
+ depends on BROKEN && X86
help
Current Graphics drivers tend to use physical address
for DMA and avoid using DMA APIs. Setting this config
@@ -74,7 +75,7 @@ config INTEL_IOMMU_BROKEN_GFX_WA
config INTEL_IOMMU_FLOPPY_WA
def_bool y
- depends on INTEL_IOMMU && X86
+ depends on X86
help
Floppy disk drivers are known to bypass DMA API calls
thereby failing to work when IOMMU is enabled. This
@@ -83,7 +84,7 @@ config INTEL_IOMMU_FLOPPY_WA
config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
bool "Enable Intel IOMMU scalable mode by default"
- depends on INTEL_IOMMU
+ default y
help
Selecting this option will enable by default the scalable mode if
hardware presents the capability. The scalable mode is defined in
@@ -92,3 +93,5 @@ config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
is not selected, scalable mode support could also be enabled by
passing intel_iommu=sm_on to the kernel. If not sure, please use
the default value.
+
+endif # INTEL_IOMMU
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index d66f79acd14d..0ec5514c9980 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -149,8 +149,6 @@ dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event)
} else {
info = kzalloc(size, GFP_KERNEL);
if (!info) {
- pr_warn("Out of memory when allocating notify_info "
- "for %s.\n", pci_name(dev));
if (dmar_dev_scope_status == 0)
dmar_dev_scope_status = -ENOMEM;
return NULL;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index dd22fc7d5176..d75f59ae28e6 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -33,6 +33,7 @@
#include <linux/iommu.h>
#include <linux/dma-iommu.h>
#include <linux/intel-iommu.h>
+#include <linux/intel-svm.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
@@ -85,24 +86,6 @@
#define LEVEL_STRIDE (9)
#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
-/*
- * This bitmap is used to advertise the page sizes our hardware support
- * to the IOMMU core, which will then use this information to split
- * physically contiguous memory regions it is mapping into page sizes
- * that we support.
- *
- * Traditionally the IOMMU core just handed us the mappings directly,
- * after making sure the size is an order of a 4KiB page and that the
- * mapping has natural alignment.
- *
- * To retain this behavior, we currently advertise that we support
- * all page sizes that are an order of 4KiB.
- *
- * If at some point we'd like to utilize the IOMMU core's new behavior,
- * we could change this to advertise the real page sizes we support.
- */
-#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
-
static inline int agaw_to_level(int agaw)
{
return agaw + 2;
@@ -345,23 +328,13 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
dma_addr_t iova);
-#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
-int dmar_disabled = 0;
-#else
-int dmar_disabled = 1;
-#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
-
-#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
-int intel_iommu_sm = 1;
-#else
-int intel_iommu_sm;
-#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
+int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
+int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);
static int dmar_map_gfx = 1;
-static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;
@@ -454,14 +427,17 @@ static int __init intel_iommu_setup(char *str)
pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
iommu_dma_forcedac = true;
} else if (!strncmp(str, "strict", 6)) {
- pr_info("Disable batched IOTLB flush\n");
- intel_iommu_strict = 1;
+ pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
+ iommu_set_dma_strict();
} else if (!strncmp(str, "sp_off", 6)) {
pr_info("Disable supported super page\n");
intel_iommu_superpage = 0;
} else if (!strncmp(str, "sm_on", 5)) {
- pr_info("Intel-IOMMU: scalable mode supported\n");
+ pr_info("Enable scalable mode if hardware supports\n");
intel_iommu_sm = 1;
+ } else if (!strncmp(str, "sm_off", 6)) {
+ pr_info("Scalable mode is disallowed\n");
+ intel_iommu_sm = 0;
} else if (!strncmp(str, "tboot_noforce", 13)) {
pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
intel_iommu_tboot_noforce = 1;
@@ -601,7 +577,7 @@ struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
int iommu_id;
/* si_domain and vm domain should not get here. */
- if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
+ if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
return NULL;
for_each_domain_iommu(iommu_id, domain)
@@ -736,6 +712,23 @@ static int domain_update_device_node(struct dmar_domain *domain)
static void domain_update_iotlb(struct dmar_domain *domain);
+/* Return the super pagesize bitmap if supported. */
+static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
+{
+ unsigned long bitmap = 0;
+
+ /*
+ * A 1-level super page supports a 2MiB page size; a 2-level super page
+ * supports both 2MiB and 1GiB.
+ */
+ if (domain->iommu_superpage == 1)
+ bitmap |= SZ_2M;
+ else if (domain->iommu_superpage == 2)
+ bitmap |= SZ_2M | SZ_1G;
+
+ return bitmap;
+}
+
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
@@ -762,6 +755,7 @@ static void domain_update_iommu_cap(struct dmar_domain *domain)
else
domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
+ domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
domain_update_iotlb(domain);
}
@@ -1035,7 +1029,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
if (domain_use_first_level(domain)) {
pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
- if (domain->domain.type == IOMMU_DOMAIN_DMA)
+ if (iommu_is_dma_domain(&domain->domain))
pteval |= DMA_FL_PTE_ACCESS;
}
if (cmpxchg64(&pte->val, 0ULL, pteval))
@@ -1548,7 +1542,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info)
if (info->pri_supported &&
(info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
- !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
+ !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
info->pri_enabled = 1;
#endif
if (info->ats_supported && pci_ats_page_aligned(pdev) &&
@@ -1780,11 +1774,8 @@ static int iommu_init_domains(struct intel_iommu *iommu)
spin_lock_init(&iommu->lock);
iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
- if (!iommu->domain_ids) {
- pr_err("%s: Allocating domain id array failed\n",
- iommu->name);
+ if (!iommu->domain_ids)
return -ENOMEM;
- }
size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
iommu->domains = kzalloc(size, GFP_KERNEL);
@@ -1980,10 +1971,6 @@ static void domain_exit(struct dmar_domain *domain)
/* Remove associated devices and clear attached or cached domains */
domain_remove_dev_info(domain);
- /* destroy iovas */
- if (domain->domain.type == IOMMU_DOMAIN_DMA)
- iommu_put_dma_cookie(&domain->domain);
-
if (domain->pgd) {
struct page *freelist;
@@ -2334,9 +2321,9 @@ static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
unsigned long phys_pfn, unsigned long nr_pages, int prot)
{
+ struct dma_pte *first_pte = NULL, *pte = NULL;
unsigned int largepage_lvl = 0;
unsigned long lvl_pages = 0;
- struct dma_pte *pte = NULL;
phys_addr_t pteval;
u64 attr;
@@ -2348,13 +2335,9 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
attr |= DMA_FL_PTE_PRESENT;
if (domain_use_first_level(domain)) {
- attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
-
- if (domain->domain.type == IOMMU_DOMAIN_DMA) {
- attr |= DMA_FL_PTE_ACCESS;
- if (prot & DMA_PTE_WRITE)
- attr |= DMA_FL_PTE_DIRTY;
- }
+ attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
+ if (prot & DMA_PTE_WRITE)
+ attr |= DMA_FL_PTE_DIRTY;
}
pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
@@ -2369,6 +2352,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
if (!pte)
return -ENOMEM;
+ first_pte = pte;
+
/* It is large page*/
if (largepage_lvl > 1) {
unsigned long end_pfn;
@@ -2416,14 +2401,14 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
* recalculate 'pte' and switch back to smaller pages for the
* end of the mapping, if the trailing size is not enough to
* use another superpage (i.e. nr_pages < lvl_pages).
- *
- * We leave clflush for the leaf pte changes to iotlb_sync_map()
- * callback.
*/
pte++;
if (!nr_pages || first_pte_in_page(pte) ||
- (largepage_lvl > 1 && nr_pages < lvl_pages))
+ (largepage_lvl > 1 && nr_pages < lvl_pages)) {
+ domain_flush_cache(domain, first_pte,
+ (void *)pte - (void *)first_pte);
pte = NULL;
+ }
}
return 0;
@@ -3227,7 +3212,6 @@ static int __init init_dmars(void)
g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
GFP_KERNEL);
if (!g_iommus) {
- pr_err("Allocating global iommu array failed\n");
ret = -ENOMEM;
goto error;
}
@@ -4393,9 +4377,9 @@ int __init intel_iommu_init(void)
* is likely to be much lower than the overhead of synchronizing
* the virtual and physical IOMMU page-tables.
*/
- if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
- pr_warn("IOMMU batching is disabled due to virtualization");
- intel_iommu_strict = 1;
+ if (cap_caching_mode(iommu->cap)) {
+ pr_info_once("IOMMU batching disallowed due to virtualization\n");
+ iommu_set_dma_strict();
}
iommu_device_sysfs_add(&iommu->iommu, NULL,
intel_iommu_groups,
@@ -4404,7 +4388,6 @@ int __init intel_iommu_init(void)
}
up_read(&dmar_global_lock);
- iommu_set_dma_strict(intel_iommu_strict);
bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
if (si_domain && !hw_pass_through)
register_memory_notifier(&intel_iommu_memory_nb);
@@ -4532,6 +4515,7 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
switch (type) {
case IOMMU_DOMAIN_DMA:
+ case IOMMU_DOMAIN_DMA_FQ:
case IOMMU_DOMAIN_UNMANAGED:
dmar_domain = alloc_domain(0);
if (!dmar_domain) {
@@ -4544,10 +4528,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
return NULL;
}
- if (type == IOMMU_DOMAIN_DMA &&
- iommu_get_dma_cookie(&dmar_domain->domain))
- return NULL;
-
domain = &dmar_domain->domain;
domain->geometry.aperture_start = 0;
domain->geometry.aperture_end =
@@ -5067,6 +5047,28 @@ static int intel_iommu_map(struct iommu_domain *domain,
hpa >> VTD_PAGE_SHIFT, size, prot);
}
+static int intel_iommu_map_pages(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t paddr,
+ size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped)
+{
+ unsigned long pgshift = __ffs(pgsize);
+ size_t size = pgcount << pgshift;
+ int ret;
+
+ if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
+ return -EINVAL;
+
+ if (!IS_ALIGNED(iova | paddr, pgsize))
+ return -EINVAL;
+
+ ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
+ if (!ret && mapped)
+ *mapped = size;
+
+ return ret;
+}
+
static size_t intel_iommu_unmap(struct iommu_domain *domain,
unsigned long iova, size_t size,
struct iommu_iotlb_gather *gather)
@@ -5096,6 +5098,17 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
return size;
}
+static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
+ unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *gather)
+{
+ unsigned long pgshift = __ffs(pgsize);
+ size_t size = pgcount << pgshift;
+
+ return intel_iommu_unmap(domain, iova, size, gather);
+}
+
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
{
@@ -5172,12 +5185,8 @@ static void intel_iommu_release_device(struct device *dev)
static void intel_iommu_probe_finalize(struct device *dev)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
-
- if (domain && domain->type == IOMMU_DOMAIN_DMA)
- iommu_setup_dma_ops(dev, 0, U64_MAX);
- else
- set_dma_ops(dev, NULL);
+ set_dma_ops(dev, NULL);
+ iommu_setup_dma_ops(dev, 0, U64_MAX);
}
static void intel_iommu_get_resv_regions(struct device *device,
@@ -5532,39 +5541,6 @@ static bool risky_device(struct pci_dev *pdev)
return false;
}
-static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
- unsigned long clf_pages)
-{
- struct dma_pte *first_pte = NULL, *pte = NULL;
- unsigned long lvl_pages = 0;
- int level = 0;
-
- while (clf_pages > 0) {
- if (!pte) {
- level = 0;
- pte = pfn_to_dma_pte(domain, clf_pfn, &level);
- if (WARN_ON(!pte))
- return;
- first_pte = pte;
- lvl_pages = lvl_to_nr_pages(level);
- }
-
- if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
- return;
-
- clf_pages -= lvl_pages;
- clf_pfn += lvl_pages;
- pte++;
-
- if (!clf_pages || first_pte_in_page(pte) ||
- (level > 1 && clf_pages < lvl_pages)) {
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
- pte = NULL;
- }
- }
-}
-
static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
@@ -5574,9 +5550,6 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
struct intel_iommu *iommu;
int iommu_id;
- if (!dmar_domain->iommu_coherency)
- clflush_sync_map(dmar_domain, pfn, pages);
-
for_each_domain_iommu(iommu_id, dmar_domain) {
iommu = g_iommus[iommu_id];
__mapping_notify_one(iommu, dmar_domain, pfn, pages);
@@ -5593,9 +5566,9 @@ const struct iommu_ops intel_iommu_ops = {
.aux_attach_dev = intel_iommu_aux_attach_device,
.aux_detach_dev = intel_iommu_aux_detach_device,
.aux_get_pasid = intel_iommu_aux_get_pasid,
- .map = intel_iommu_map,
+ .map_pages = intel_iommu_map_pages,
+ .unmap_pages = intel_iommu_unmap_pages,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
- .unmap = intel_iommu_unmap,
.flush_iotlb_all = intel_flush_iotlb_all,
.iotlb_sync = intel_iommu_tlb_sync,
.iova_to_phys = intel_iommu_iova_to_phys,
@@ -5611,7 +5584,7 @@ const struct iommu_ops intel_iommu_ops = {
.dev_disable_feat = intel_iommu_dev_disable_feat,
.is_attach_deferred = intel_iommu_is_attach_deferred,
.def_domain_type = device_def_domain_type,
- .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
+ .pgsize_bitmap = SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
.cache_invalidate = intel_iommu_sva_invalidate,
.sva_bind_gpasid = intel_svm_bind_gpasid,
@@ -5714,8 +5687,8 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
} else if (dmar_map_gfx) {
/* we have to ensure the gfx device is idle before we flush */
pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
- intel_iommu_strict = 1;
- }
+ iommu_set_dma_strict();
+ }
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
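
The new intel_iommu_map_pages()/intel_iommu_unmap_pages() wrappers only translate the pgsize/pgcount interface into the old size-based one: they accept one of the advertised page sizes (4KiB, 2MiB, 1GiB), require natural alignment of iova and paddr, and compute the size as pgcount << __ffs(pgsize). Below is a standalone user-space check of that arithmetic, not part of the patch; __builtin_ctzl stands in for the kernel's __ffs():

/*
 * Standalone check of the map_pages() wrapper arithmetic (illustrative only).
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static int map_pages_args_ok(uint64_t iova, uint64_t paddr,
			     size_t pgsize, size_t pgcount, size_t *size)
{
	/* only the advertised page sizes, naturally aligned */
	if (pgsize != (1UL << 12) && pgsize != (1UL << 21) && pgsize != (1UL << 30))
		return 0;
	if ((iova | paddr) & (pgsize - 1))
		return 0;

	*size = pgcount << __builtin_ctzl(pgsize);
	return 1;
}

int main(void)
{
	size_t size;

	/* 4 contiguous 2MiB pages -> one 8MiB mapping request */
	if (map_pages_args_ok(0x200000, 0x40000000, 1UL << 21, 4, &size))
		printf("mapping %zu bytes\n", size);
	return 0;
}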
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 9ec374e17469..07c390aed1fe 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -517,7 +517,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
if (WARN_ON(!pte))
return;
- if (!(pte->val[0] & PASID_PTE_PRESENT))
+ if (!pasid_pte_is_present(pte))
return;
did = pasid_get_domain_id(pte);
@@ -540,6 +540,10 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
devtlb_invalidation_with_pasid(iommu, dev, pasid);
}
+/*
+ * This function flushes the cache for a newly set up PASID table entry.
+ * Callers of it should not modify PASID table entries that are in use.
+ */
static void pasid_flush_caches(struct intel_iommu *iommu,
struct pasid_entry *pte,
u32 pasid, u16 did)
@@ -591,6 +595,10 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
if (WARN_ON(!pte))
return -EINVAL;
+ /* Caller must ensure PASID entry is not in use. */
+ if (pasid_pte_is_present(pte))
+ return -EBUSY;
+
pasid_clear_entry(pte);
/* Setup the first level page table pointer: */
@@ -690,6 +698,10 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -ENODEV;
}
+ /* Caller must ensure PASID entry is not in use. */
+ if (pasid_pte_is_present(pte))
+ return -EBUSY;
+
pasid_clear_entry(pte);
pasid_set_domain_id(pte, did);
pasid_set_slptr(pte, pgd_val);
@@ -729,6 +741,10 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
return -ENODEV;
}
+ /* Caller must ensure PASID entry is not in use. */
+ if (pasid_pte_is_present(pte))
+ return -EBUSY;
+
pasid_clear_entry(pte);
pasid_set_domain_id(pte, did);
pasid_set_address_width(pte, iommu->agaw);
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index c11bc8b833b8..d5552e2c160d 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -28,12 +28,12 @@
#define VCMD_CMD_ALLOC 0x1
#define VCMD_CMD_FREE 0x2
#define VCMD_VRSP_IP 0x1
-#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3)
+#define VCMD_VRSP_SC(e) (((e) & 0xff) >> 1)
#define VCMD_VRSP_SC_SUCCESS 0
-#define VCMD_VRSP_SC_NO_PASID_AVAIL 2
-#define VCMD_VRSP_SC_INVALID_PASID 2
-#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff)
-#define VCMD_CMD_OPERAND(e) ((e) << 8)
+#define VCMD_VRSP_SC_NO_PASID_AVAIL 16
+#define VCMD_VRSP_SC_INVALID_PASID 16
+#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 16) & 0xfffff)
+#define VCMD_CMD_OPERAND(e) ((e) << 16)
/*
* Domain ID reserved for pasid entries programmed for first-level
* only and pass-through transfer modes.
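
The macro change above moves the VCMD status code into the low byte (bits 7:1) and the PASID operand/result up to bit 16. A small user-space demonstration of the updated decoding follows; the macros are copied from the hunk, while the sample PASID value is made up for illustration:

/*
 * Demonstration of the updated VCMD response decoding (illustrative only).
 */
#include <stdint.h>
#include <stdio.h>

#define VCMD_VRSP_SC(e)			(((e) & 0xff) >> 1)
#define VCMD_VRSP_RESULT_PASID(e)	(((e) >> 16) & 0xfffff)
#define VCMD_CMD_OPERAND(e)		((e) << 16)

int main(void)
{
	/* pretend the hardware echoed back PASID 0x1234 with status code 0 */
	uint64_t rsp = VCMD_CMD_OPERAND(0x1234ULL);

	printf("status code: %llu\n", (unsigned long long)VCMD_VRSP_SC(rsp));
	printf("pasid:       0x%llx\n", (unsigned long long)VCMD_VRSP_RESULT_PASID(rsp));
	return 0;
}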
diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c
index 73b7ec705552..0e8e03252d92 100644
--- a/drivers/iommu/intel/perf.c
+++ b/drivers/iommu/intel/perf.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/**
+/*
* perf.c - performance monitor
*
* Copyright (C) 2021 Intel Corporation
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 4b9b3f35ba0e..2014fe8695ac 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -31,8 +31,6 @@ static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
-#define PRQ_ORDER 0
-
static DEFINE_XARRAY_ALLOC(pasid_private_array);
static int pasid_private_add(ioasid_t pasid, void *priv)
{
@@ -725,8 +723,6 @@ struct page_req_dsc {
u64 priv_data[2];
};
-#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
-
static bool is_canonical_address(u64 addr)
{
int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index d4004bcf333a..bfb6acb651e5 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -519,11 +519,12 @@ static int __arm_v7s_map(struct arm_v7s_io_pgtable *data, unsigned long iova,
return __arm_v7s_map(data, iova, paddr, size, prot, lvl + 1, cptep, gfp);
}
-static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+static int arm_v7s_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped)
{
struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
- int ret;
+ int ret = -EINVAL;
if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias) ||
paddr >= (1ULL << data->iop.cfg.oas)))
@@ -533,7 +534,17 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova,
if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
return 0;
- ret = __arm_v7s_map(data, iova, paddr, size, prot, 1, data->pgd, gfp);
+ while (pgcount--) {
+ ret = __arm_v7s_map(data, iova, paddr, pgsize, prot, 1, data->pgd,
+ gfp);
+ if (ret)
+ break;
+
+ iova += pgsize;
+ paddr += pgsize;
+ if (mapped)
+ *mapped += pgsize;
+ }
/*
* Synchronise all PTE updates for the new mapping before there's
* a chance for anything to kick off a table walk for the new iova.
@@ -543,6 +554,12 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova,
return ret;
}
+static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ return arm_v7s_map_pages(ops, iova, paddr, size, 1, prot, gfp, NULL);
+}
+
static void arm_v7s_free_pgtable(struct io_pgtable *iop)
{
struct arm_v7s_io_pgtable *data = io_pgtable_to_data(iop);
@@ -683,14 +700,7 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
ARM_V7S_BLOCK_SIZE(lvl + 1));
ptep = iopte_deref(pte[i], lvl, data);
__arm_v7s_free_table(ptep, lvl + 1, data);
- } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) {
- /*
- * Order the PTE update against queueing the IOVA, to
- * guarantee that a flush callback from a different CPU
- * has observed it before the TLBIALL can be issued.
- */
- smp_wmb();
- } else {
+ } else if (!iommu_iotlb_gather_queued(gather)) {
io_pgtable_tlb_add_page(iop, gather, iova, blk_size);
}
iova += blk_size;
@@ -710,15 +720,32 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
return __arm_v7s_unmap(data, gather, iova, size, lvl + 1, ptep);
}
-static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova,
- size_t size, struct iommu_iotlb_gather *gather)
+static size_t arm_v7s_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *gather)
{
struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
+ size_t unmapped = 0, ret;
if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))
return 0;
- return __arm_v7s_unmap(data, gather, iova, size, 1, data->pgd);
+ while (pgcount--) {
+ ret = __arm_v7s_unmap(data, gather, iova, pgsize, 1, data->pgd);
+ if (!ret)
+ break;
+
+ unmapped += pgsize;
+ iova += pgsize;
+ }
+
+ return unmapped;
+}
+
+static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ return arm_v7s_unmap_pages(ops, iova, size, 1, gather);
}
static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops,
@@ -757,8 +784,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NO_PERMS |
- IO_PGTABLE_QUIRK_ARM_MTK_EXT |
- IO_PGTABLE_QUIRK_NON_STRICT))
+ IO_PGTABLE_QUIRK_ARM_MTK_EXT))
return NULL;
/* If ARM_MTK_4GB is enabled, the NO_PERMS is also expected. */
@@ -780,7 +806,9 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
data->iop.ops = (struct io_pgtable_ops) {
.map = arm_v7s_map,
+ .map_pages = arm_v7s_map_pages,
.unmap = arm_v7s_unmap,
+ .unmap_pages = arm_v7s_unmap_pages,
.iova_to_phys = arm_v7s_iova_to_phys,
};
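
The conversion follows a simple pattern: the new ->map_pages() walks pgcount pages of one size through the existing single-entry path and syncs the PTEs once for the whole batch, while the legacy ->map() becomes a pgcount == 1 call into it. Stripped of the driver specifics, the shape is roughly as below; the names are illustrative stand-ins, not the driver's functions:

#include <stddef.h>

/* stand-in for the driver's single-entry mapper */
static int example_map_one(unsigned long iova, unsigned long paddr, size_t pgsize)
{
        (void)iova; (void)paddr; (void)pgsize;
        return 0;
}

static int example_map_pages(unsigned long iova, unsigned long paddr,
                             size_t pgsize, size_t pgcount, size_t *mapped)
{
        int ret = -1;   /* -EINVAL in the kernel */

        while (pgcount--) {
                ret = example_map_one(iova, paddr, pgsize);
                if (ret)
                        break;
                iova += pgsize;
                paddr += pgsize;
                if (mapped)
                        *mapped += pgsize;
        }
        /* one PTE sync / barrier here covers the whole batch */
        return ret;
}

static int example_map(unsigned long iova, unsigned long paddr, size_t size)
{
        return example_map_pages(iova, paddr, size, 1, NULL);
}
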
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 87def58e79b5..dd9e47189d0d 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -46,6 +46,9 @@
#define ARM_LPAE_PGD_SIZE(d) \
(sizeof(arm_lpae_iopte) << (d)->pgd_bits)
+#define ARM_LPAE_PTES_PER_TABLE(d) \
+ (ARM_LPAE_GRANULE(d) >> ilog2(sizeof(arm_lpae_iopte)))
+
/*
* Calculate the index at level l used to map virtual address a using the
* pagetable in d.
@@ -127,6 +130,9 @@
#define ARM_MALI_LPAE_MEMATTR_IMP_DEF 0x88ULL
#define ARM_MALI_LPAE_MEMATTR_WRITE_ALLOC 0x8DULL
+#define APPLE_DART_PTE_PROT_NO_WRITE (1<<7)
+#define APPLE_DART_PTE_PROT_NO_READ (1<<8)
+
/* IOPTE accessors */
#define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))
@@ -232,70 +238,77 @@ static void __arm_lpae_free_pages(void *pages, size_t size,
free_pages((unsigned long)pages, get_order(size));
}
-static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep,
+static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
struct io_pgtable_cfg *cfg)
{
dma_sync_single_for_device(cfg->iommu_dev, __arm_lpae_dma_addr(ptep),
- sizeof(*ptep), DMA_TO_DEVICE);
+ sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
}
-static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
- struct io_pgtable_cfg *cfg)
+static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
{
- *ptep = pte;
+
+ *ptep = 0;
if (!cfg->coherent_walk)
- __arm_lpae_sync_pte(ptep, cfg);
+ __arm_lpae_sync_pte(ptep, 1, cfg);
}
static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size, int lvl,
- arm_lpae_iopte *ptep);
+ unsigned long iova, size_t size, size_t pgcount,
+ int lvl, arm_lpae_iopte *ptep);
static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
phys_addr_t paddr, arm_lpae_iopte prot,
- int lvl, arm_lpae_iopte *ptep)
+ int lvl, int num_entries, arm_lpae_iopte *ptep)
{
arm_lpae_iopte pte = prot;
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+ int i;
if (data->iop.fmt != ARM_MALI_LPAE && lvl == ARM_LPAE_MAX_LEVELS - 1)
pte |= ARM_LPAE_PTE_TYPE_PAGE;
else
pte |= ARM_LPAE_PTE_TYPE_BLOCK;
- pte |= paddr_to_iopte(paddr, data);
+ for (i = 0; i < num_entries; i++)
+ ptep[i] = pte | paddr_to_iopte(paddr + i * sz, data);
- __arm_lpae_set_pte(ptep, pte, &data->iop.cfg);
+ if (!cfg->coherent_walk)
+ __arm_lpae_sync_pte(ptep, num_entries, cfg);
}
static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
unsigned long iova, phys_addr_t paddr,
- arm_lpae_iopte prot, int lvl,
+ arm_lpae_iopte prot, int lvl, int num_entries,
arm_lpae_iopte *ptep)
{
- arm_lpae_iopte pte = *ptep;
-
- if (iopte_leaf(pte, lvl, data->iop.fmt)) {
- /* We require an unmap first */
- WARN_ON(!selftest_running);
- return -EEXIST;
- } else if (iopte_type(pte) == ARM_LPAE_PTE_TYPE_TABLE) {
- /*
- * We need to unmap and free the old table before
- * overwriting it with a block entry.
- */
- arm_lpae_iopte *tblp;
- size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
-
- tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
- if (__arm_lpae_unmap(data, NULL, iova, sz, lvl, tblp) != sz) {
- WARN_ON(1);
- return -EINVAL;
+ int i;
+
+ for (i = 0; i < num_entries; i++)
+ if (iopte_leaf(ptep[i], lvl, data->iop.fmt)) {
+ /* We require an unmap first */
+ WARN_ON(!selftest_running);
+ return -EEXIST;
+ } else if (iopte_type(ptep[i]) == ARM_LPAE_PTE_TYPE_TABLE) {
+ /*
+ * We need to unmap and free the old table before
+ * overwriting it with a block entry.
+ */
+ arm_lpae_iopte *tblp;
+ size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+
+ tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
+ if (__arm_lpae_unmap(data, NULL, iova + i * sz, sz, 1,
+ lvl, tblp) != sz) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
}
- }
- __arm_lpae_init_pte(data, paddr, prot, lvl, ptep);
+ __arm_lpae_init_pte(data, paddr, prot, lvl, num_entries, ptep);
return 0;
}
@@ -323,7 +336,7 @@ static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
return old;
/* Even if it's not ours, there's no point waiting; just kick it */
- __arm_lpae_sync_pte(ptep, cfg);
+ __arm_lpae_sync_pte(ptep, 1, cfg);
if (old == curr)
WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC);
@@ -331,20 +344,30 @@ static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
}
static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
- phys_addr_t paddr, size_t size, arm_lpae_iopte prot,
- int lvl, arm_lpae_iopte *ptep, gfp_t gfp)
+ phys_addr_t paddr, size_t size, size_t pgcount,
+ arm_lpae_iopte prot, int lvl, arm_lpae_iopte *ptep,
+ gfp_t gfp, size_t *mapped)
{
arm_lpae_iopte *cptep, pte;
size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
size_t tblsz = ARM_LPAE_GRANULE(data);
struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ int ret = 0, num_entries, max_entries, map_idx_start;
/* Find our entry at the current level */
- ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+ map_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
+ ptep += map_idx_start;
/* If we can install a leaf entry at this level, then do so */
- if (size == block_size)
- return arm_lpae_init_pte(data, iova, paddr, prot, lvl, ptep);
+ if (size == block_size) {
+ max_entries = ARM_LPAE_PTES_PER_TABLE(data) - map_idx_start;
+ num_entries = min_t(int, pgcount, max_entries);
+ ret = arm_lpae_init_pte(data, iova, paddr, prot, lvl, num_entries, ptep);
+ if (!ret && mapped)
+ *mapped += num_entries * size;
+
+ return ret;
+ }
/* We can't allocate tables at the final level */
if (WARN_ON(lvl >= ARM_LPAE_MAX_LEVELS - 1))
@@ -361,7 +384,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
if (pte)
__arm_lpae_free_pages(cptep, tblsz, cfg);
} else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) {
- __arm_lpae_sync_pte(ptep, cfg);
+ __arm_lpae_sync_pte(ptep, 1, cfg);
}
if (pte && !iopte_leaf(pte, lvl, data->iop.fmt)) {
@@ -373,7 +396,8 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
}
/* Rinse, repeat */
- return __arm_lpae_map(data, iova, paddr, size, prot, lvl + 1, cptep, gfp);
+ return __arm_lpae_map(data, iova, paddr, size, pgcount, prot, lvl + 1,
+ cptep, gfp, mapped);
}
static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
@@ -381,6 +405,15 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
{
arm_lpae_iopte pte;
+ if (data->iop.fmt == APPLE_DART) {
+ pte = 0;
+ if (!(prot & IOMMU_WRITE))
+ pte |= APPLE_DART_PTE_PROT_NO_WRITE;
+ if (!(prot & IOMMU_READ))
+ pte |= APPLE_DART_PTE_PROT_NO_READ;
+ return pte;
+ }
+
if (data->iop.fmt == ARM_64_LPAE_S1 ||
data->iop.fmt == ARM_32_LPAE_S1) {
pte = ARM_LPAE_PTE_nG;
@@ -440,8 +473,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
return pte;
}
-static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t size, int iommu_prot, gfp_t gfp)
+static int arm_lpae_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int iommu_prot, gfp_t gfp, size_t *mapped)
{
struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
struct io_pgtable_cfg *cfg = &data->iop.cfg;
@@ -450,7 +484,7 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
arm_lpae_iopte prot;
long iaext = (s64)iova >> cfg->ias;
- if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size))
+ if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize))
return -EINVAL;
if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)
@@ -463,7 +497,8 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
return 0;
prot = arm_lpae_prot_to_pte(data, iommu_prot);
- ret = __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep, gfp);
+ ret = __arm_lpae_map(data, iova, paddr, pgsize, pgcount, prot, lvl,
+ ptep, gfp, mapped);
/*
* Synchronise all PTE updates for the new mapping before there's
* a chance for anything to kick off a table walk for the new iova.
@@ -473,6 +508,13 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
return ret;
}
+static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
+ phys_addr_t paddr, size_t size, int iommu_prot, gfp_t gfp)
+{
+ return arm_lpae_map_pages(ops, iova, paddr, size, 1, iommu_prot, gfp,
+ NULL);
+}
+
static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
arm_lpae_iopte *ptep)
{
@@ -516,14 +558,15 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
struct iommu_iotlb_gather *gather,
unsigned long iova, size_t size,
arm_lpae_iopte blk_pte, int lvl,
- arm_lpae_iopte *ptep)
+ arm_lpae_iopte *ptep, size_t pgcount)
{
struct io_pgtable_cfg *cfg = &data->iop.cfg;
arm_lpae_iopte pte, *tablep;
phys_addr_t blk_paddr;
size_t tablesz = ARM_LPAE_GRANULE(data);
size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
- int i, unmap_idx = -1;
+ int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data);
+ int i, unmap_idx_start = -1, num_entries = 0, max_entries;
if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
return 0;
@@ -532,18 +575,21 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
if (!tablep)
return 0; /* Bytes unmapped */
- if (size == split_sz)
- unmap_idx = ARM_LPAE_LVL_IDX(iova, lvl, data);
+ if (size == split_sz) {
+ unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
+ max_entries = ptes_per_table - unmap_idx_start;
+ num_entries = min_t(int, pgcount, max_entries);
+ }
blk_paddr = iopte_to_paddr(blk_pte, data);
pte = iopte_prot(blk_pte);
- for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz) {
+ for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) {
/* Unmap! */
- if (i == unmap_idx)
+ if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries))
continue;
- __arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]);
+ __arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]);
}
pte = arm_lpae_install_table(tablep, ptep, blk_pte, cfg);
@@ -558,76 +604,85 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
return 0;
tablep = iopte_deref(pte, data);
- } else if (unmap_idx >= 0) {
- io_pgtable_tlb_add_page(&data->iop, gather, iova, size);
- return size;
+ } else if (unmap_idx_start >= 0) {
+ for (i = 0; i < num_entries; i++)
+ io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size);
+
+ return num_entries * size;
}
- return __arm_lpae_unmap(data, gather, iova, size, lvl, tablep);
+ return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep);
}
static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size, int lvl,
- arm_lpae_iopte *ptep)
+ unsigned long iova, size_t size, size_t pgcount,
+ int lvl, arm_lpae_iopte *ptep)
{
arm_lpae_iopte pte;
struct io_pgtable *iop = &data->iop;
+ int i = 0, num_entries, max_entries, unmap_idx_start;
/* Something went horribly wrong and we ran out of page table */
if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
return 0;
- ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+ unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
+ ptep += unmap_idx_start;
pte = READ_ONCE(*ptep);
if (WARN_ON(!pte))
return 0;
/* If the size matches this level, we're in the right place */
if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
- __arm_lpae_set_pte(ptep, 0, &iop->cfg);
-
- if (!iopte_leaf(pte, lvl, iop->fmt)) {
- /* Also flush any partial walks */
- io_pgtable_tlb_flush_walk(iop, iova, size,
- ARM_LPAE_GRANULE(data));
- ptep = iopte_deref(pte, data);
- __arm_lpae_free_pgtable(data, lvl + 1, ptep);
- } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) {
- /*
- * Order the PTE update against queueing the IOVA, to
- * guarantee that a flush callback from a different CPU
- * has observed it before the TLBIALL can be issued.
- */
- smp_wmb();
- } else {
- io_pgtable_tlb_add_page(iop, gather, iova, size);
+ max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
+ num_entries = min_t(int, pgcount, max_entries);
+
+ while (i < num_entries) {
+ pte = READ_ONCE(*ptep);
+ if (WARN_ON(!pte))
+ break;
+
+ __arm_lpae_clear_pte(ptep, &iop->cfg);
+
+ if (!iopte_leaf(pte, lvl, iop->fmt)) {
+ /* Also flush any partial walks */
+ io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
+ ARM_LPAE_GRANULE(data));
+ __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
+ } else if (!iommu_iotlb_gather_queued(gather)) {
+ io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
+ }
+
+ ptep++;
+ i++;
}
- return size;
+ return i * size;
} else if (iopte_leaf(pte, lvl, iop->fmt)) {
/*
* Insert a table at the next level to map the old region,
* minus the part we want to unmap
*/
return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
- lvl + 1, ptep);
+ lvl + 1, ptep, pgcount);
}
/* Keep on walkin' */
ptep = iopte_deref(pte, data);
- return __arm_lpae_unmap(data, gather, iova, size, lvl + 1, ptep);
+ return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl + 1, ptep);
}
-static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
- size_t size, struct iommu_iotlb_gather *gather)
+static size_t arm_lpae_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *gather)
{
struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
struct io_pgtable_cfg *cfg = &data->iop.cfg;
arm_lpae_iopte *ptep = data->pgd;
long iaext = (s64)iova >> cfg->ias;
- if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size))
+ if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount))
return 0;
if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)
@@ -635,7 +690,14 @@ static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
if (WARN_ON(iaext))
return 0;
- return __arm_lpae_unmap(data, gather, iova, size, data->start_level, ptep);
+ return __arm_lpae_unmap(data, gather, iova, pgsize, pgcount,
+ data->start_level, ptep);
+}
+
+static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
+{
+ return arm_lpae_unmap_pages(ops, iova, size, 1, gather);
}
static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
@@ -750,7 +812,9 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
data->iop.ops = (struct io_pgtable_ops) {
.map = arm_lpae_map,
+ .map_pages = arm_lpae_map_pages,
.unmap = arm_lpae_unmap,
+ .unmap_pages = arm_lpae_unmap_pages,
.iova_to_phys = arm_lpae_iova_to_phys,
};
@@ -766,7 +830,6 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
bool tg1;
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
- IO_PGTABLE_QUIRK_NON_STRICT |
IO_PGTABLE_QUIRK_ARM_TTBR1 |
IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
return NULL;
@@ -870,7 +933,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr;
/* The NS quirk doesn't apply at stage 2 */
- if (cfg->quirks & ~(IO_PGTABLE_QUIRK_NON_STRICT))
+ if (cfg->quirks)
return NULL;
data = arm_lpae_alloc_pgtable(cfg);
@@ -1043,6 +1106,52 @@ out_free_data:
return NULL;
}
+static struct io_pgtable *
+apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
+{
+ struct arm_lpae_io_pgtable *data;
+ int i;
+
+ if (cfg->oas > 36)
+ return NULL;
+
+ data = arm_lpae_alloc_pgtable(cfg);
+ if (!data)
+ return NULL;
+
+ /*
+ * The table format itself always uses two levels, but the total VA
+ * space is mapped by four separate tables, making the MMIO registers
+ * an effective "level 1". For simplicity, though, we treat this
+ * equivalently to LPAE stage 2 concatenation at level 2, with the
+ * additional TTBRs each just pointing at consecutive pages.
+ */
+ if (data->start_level < 1)
+ goto out_free_data;
+ if (data->start_level == 1 && data->pgd_bits > 2)
+ goto out_free_data;
+ if (data->start_level > 1)
+ data->pgd_bits = 0;
+ data->start_level = 2;
+ cfg->apple_dart_cfg.n_ttbrs = 1 << data->pgd_bits;
+ data->pgd_bits += data->bits_per_level;
+
+ data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), GFP_KERNEL,
+ cfg);
+ if (!data->pgd)
+ goto out_free_data;
+
+ for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i)
+ cfg->apple_dart_cfg.ttbr[i] =
+ virt_to_phys(data->pgd + i * ARM_LPAE_GRANULE(data));
+
+ return &data->iop;
+
+out_free_data:
+ kfree(data);
+ return NULL;
+}
+
struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = {
.alloc = arm_64_lpae_alloc_pgtable_s1,
.free = arm_lpae_free_pgtable,
@@ -1068,6 +1177,11 @@ struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = {
.free = arm_lpae_free_pgtable,
};
+struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns = {
+ .alloc = apple_dart_alloc_pgtable,
+ .free = arm_lpae_free_pgtable,
+};
+
#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST
static struct io_pgtable_cfg *cfg_cookie __initdata;
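
How far a single call can batch is bounded by the slots left in the current table: ARM_LPAE_PTES_PER_TABLE() is granule / sizeof(pte), and num_entries is clamped to min(pgcount, ptes_per_table - start index). For example, with a 4K granule and 8-byte PTEs there are 512 PTEs per table, so a request landing at index 500 installs at most 12 entries before the caller loops back for the rest. A tiny sketch of that clamp, not the kernel macro itself:

#include <stddef.h>

static size_t entries_this_table(size_t granule, size_t pte_size,
                                 size_t start_idx, size_t pgcount)
{
        size_t ptes_per_table = granule / pte_size;
        size_t max_entries = ptes_per_table - start_idx;

        return pgcount < max_entries ? pgcount : max_entries;
}

/* entries_this_table(4096, 8, 500, 40) == 12 */
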
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 6e9917ce980f..f4bfcef98297 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -20,6 +20,7 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
[ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns,
[ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns,
[ARM_MALI_LPAE] = &io_pgtable_arm_mali_lpae_init_fns,
+ [APPLE_DART] = &io_pgtable_apple_dart_init_fns,
#endif
#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5a570d495f8d..3303d707bab4 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -7,7 +7,9 @@
#define pr_fmt(fmt) "iommu: " fmt
#include <linux/device.h>
+#include <linux/dma-iommu.h>
#include <linux/kernel.h>
+#include <linux/bits.h>
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/init.h>
@@ -29,7 +31,7 @@ static struct kset *iommu_group_kset;
static DEFINE_IDA(iommu_group_ida);
static unsigned int iommu_def_domain_type __read_mostly;
-static bool iommu_dma_strict __read_mostly = true;
+static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT);
static u32 iommu_cmd_line __read_mostly;
struct iommu_group {
@@ -113,6 +115,7 @@ static const char *iommu_domain_type_str(unsigned int t)
case IOMMU_DOMAIN_UNMANAGED:
return "Unmanaged";
case IOMMU_DOMAIN_DMA:
+ case IOMMU_DOMAIN_DMA_FQ:
return "Translated";
default:
return "Unknown";
@@ -133,11 +136,20 @@ static int __init iommu_subsys_init(void)
}
}
+ if (!iommu_default_passthrough() && !iommu_dma_strict)
+ iommu_def_domain_type = IOMMU_DOMAIN_DMA_FQ;
+
pr_info("Default domain type: %s %s\n",
iommu_domain_type_str(iommu_def_domain_type),
(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ?
"(set via kernel command line)" : "");
+ if (!iommu_default_passthrough())
+ pr_info("DMA domain TLB invalidation policy: %s mode %s\n",
+ iommu_dma_strict ? "strict" : "lazy",
+ (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ?
+ "(set via kernel command line)" : "");
+
return 0;
}
subsys_initcall(iommu_subsys_init);
@@ -273,7 +285,9 @@ int iommu_probe_device(struct device *dev)
* support default domains, so the return value is not yet
* checked.
*/
+ mutex_lock(&group->mutex);
iommu_alloc_default_domain(group, dev);
+ mutex_unlock(&group->mutex);
if (group->default_domain) {
ret = __iommu_attach_device(group->default_domain, dev);
@@ -344,21 +358,13 @@ static int __init iommu_dma_setup(char *str)
}
early_param("iommu.strict", iommu_dma_setup);
-void iommu_set_dma_strict(bool strict)
+void iommu_set_dma_strict(void)
{
- if (strict || !(iommu_cmd_line & IOMMU_CMD_LINE_STRICT))
- iommu_dma_strict = strict;
+ iommu_dma_strict = true;
+ if (iommu_def_domain_type == IOMMU_DOMAIN_DMA_FQ)
+ iommu_def_domain_type = IOMMU_DOMAIN_DMA;
}
-bool iommu_get_dma_strict(struct iommu_domain *domain)
-{
- /* only allow lazy flushing for DMA domains */
- if (domain->type == IOMMU_DOMAIN_DMA)
- return iommu_dma_strict;
- return true;
-}
-EXPORT_SYMBOL_GPL(iommu_get_dma_strict);
-
static ssize_t iommu_group_attr_show(struct kobject *kobj,
struct attribute *__attr, char *buf)
{
@@ -546,6 +552,9 @@ static ssize_t iommu_group_show_type(struct iommu_group *group,
case IOMMU_DOMAIN_DMA:
type = "DMA\n";
break;
+ case IOMMU_DOMAIN_DMA_FQ:
+ type = "DMA-FQ\n";
+ break;
}
}
mutex_unlock(&group->mutex);
@@ -759,7 +768,7 @@ static int iommu_create_device_direct_mappings(struct iommu_group *group,
unsigned long pg_size;
int ret = 0;
- if (!domain || domain->type != IOMMU_DOMAIN_DMA)
+ if (!domain || !iommu_is_dma_domain(domain))
return 0;
BUG_ON(!domain->pgsize_bitmap);
@@ -1944,6 +1953,11 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
/* Assume all sizes by default; the driver may override this later */
domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap;
+ /* Temporarily avoid -EEXIST while drivers still get their own cookies */
+ if (iommu_is_dma_domain(domain) && !domain->iova_cookie && iommu_get_dma_cookie(domain)) {
+ iommu_domain_free(domain);
+ domain = NULL;
+ }
return domain;
}
@@ -1955,6 +1969,7 @@ EXPORT_SYMBOL_GPL(iommu_domain_alloc);
void iommu_domain_free(struct iommu_domain *domain)
{
+ iommu_put_dma_cookie(domain);
domain->ops->domain_free(domain);
}
EXPORT_SYMBOL_GPL(iommu_domain_free);
@@ -2370,45 +2385,94 @@ EXPORT_SYMBOL_GPL(iommu_detach_group);
phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
{
- if (unlikely(domain->ops->iova_to_phys == NULL))
+ if (domain->type == IOMMU_DOMAIN_IDENTITY)
+ return iova;
+
+ if (domain->type == IOMMU_DOMAIN_BLOCKED)
return 0;
return domain->ops->iova_to_phys(domain, iova);
}
EXPORT_SYMBOL_GPL(iommu_iova_to_phys);
-static size_t iommu_pgsize(struct iommu_domain *domain,
- unsigned long addr_merge, size_t size)
+static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, size_t *count)
{
- unsigned int pgsize_idx;
- size_t pgsize;
+ unsigned int pgsize_idx, pgsize_idx_next;
+ unsigned long pgsizes;
+ size_t offset, pgsize, pgsize_next;
+ unsigned long addr_merge = paddr | iova;
- /* Max page size that still fits into 'size' */
- pgsize_idx = __fls(size);
+ /* Page sizes supported by the hardware and small enough for @size */
+ pgsizes = domain->pgsize_bitmap & GENMASK(__fls(size), 0);
- /* need to consider alignment requirements ? */
- if (likely(addr_merge)) {
- /* Max page size allowed by address */
- unsigned int align_pgsize_idx = __ffs(addr_merge);
- pgsize_idx = min(pgsize_idx, align_pgsize_idx);
- }
+ /* Constrain the page sizes further based on the maximum alignment */
+ if (likely(addr_merge))
+ pgsizes &= GENMASK(__ffs(addr_merge), 0);
- /* build a mask of acceptable page sizes */
- pgsize = (1UL << (pgsize_idx + 1)) - 1;
+ /* Make sure we have at least one suitable page size */
+ BUG_ON(!pgsizes);
- /* throw away page sizes not supported by the hardware */
- pgsize &= domain->pgsize_bitmap;
+ /* Pick the biggest page size remaining */
+ pgsize_idx = __fls(pgsizes);
+ pgsize = BIT(pgsize_idx);
+ if (!count)
+ return pgsize;
- /* make sure we're still sane */
- BUG_ON(!pgsize);
+ /* Find the next biggest supported page size, if it exists */
+ pgsizes = domain->pgsize_bitmap & ~GENMASK(pgsize_idx, 0);
+ if (!pgsizes)
+ goto out_set_count;
- /* pick the biggest page */
- pgsize_idx = __fls(pgsize);
- pgsize = 1UL << pgsize_idx;
+ pgsize_idx_next = __ffs(pgsizes);
+ pgsize_next = BIT(pgsize_idx_next);
+ /*
+ * There's no point trying a bigger page size unless the virtual
+ * and physical addresses are similarly offset within the larger page.
+ */
+ if ((iova ^ paddr) & (pgsize_next - 1))
+ goto out_set_count;
+
+ /* Calculate the offset to the next page size alignment boundary */
+ offset = pgsize_next - (addr_merge & (pgsize_next - 1));
+
+ /*
+ * If size is big enough to accommodate the larger page, reduce
+ * the number of smaller pages.
+ */
+ if (offset + pgsize_next <= size)
+ size = offset;
+
+out_set_count:
+ *count = size >> pgsize_idx;
return pgsize;
}
+static int __iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot,
+ gfp_t gfp, size_t *mapped)
+{
+ const struct iommu_ops *ops = domain->ops;
+ size_t pgsize, count;
+ int ret;
+
+ pgsize = iommu_pgsize(domain, iova, paddr, size, &count);
+
+ pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n",
+ iova, &paddr, pgsize, count);
+
+ if (ops->map_pages) {
+ ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot,
+ gfp, mapped);
+ } else {
+ ret = ops->map(domain, iova, paddr, pgsize, prot, gfp);
+ *mapped = ret ? 0 : pgsize;
+ }
+
+ return ret;
+}
+
static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
@@ -2419,7 +2483,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t orig_paddr = paddr;
int ret = 0;
- if (unlikely(ops->map == NULL ||
+ if (unlikely(!(ops->map || ops->map_pages) ||
domain->pgsize_bitmap == 0UL))
return -ENODEV;
@@ -2443,18 +2507,21 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size);
while (size) {
- size_t pgsize = iommu_pgsize(domain, iova | paddr, size);
+ size_t mapped = 0;
- pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n",
- iova, &paddr, pgsize);
- ret = ops->map(domain, iova, paddr, pgsize, prot, gfp);
+ ret = __iommu_map_pages(domain, iova, paddr, size, prot, gfp,
+ &mapped);
+ /*
+ * Some pages may have been mapped, even if an error occurred,
+ * so we should account for those so they can be unmapped.
+ */
+ size -= mapped;
if (ret)
break;
- iova += pgsize;
- paddr += pgsize;
- size -= pgsize;
+ iova += mapped;
+ paddr += mapped;
}
/* unroll mapping in case something went wrong */
@@ -2494,6 +2561,19 @@ int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova,
}
EXPORT_SYMBOL_GPL(iommu_map_atomic);
+static size_t __iommu_unmap_pages(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ const struct iommu_ops *ops = domain->ops;
+ size_t pgsize, count;
+
+ pgsize = iommu_pgsize(domain, iova, iova, size, &count);
+ return ops->unmap_pages ?
+ ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather) :
+ ops->unmap(domain, iova, pgsize, iotlb_gather);
+}
+
static size_t __iommu_unmap(struct iommu_domain *domain,
unsigned long iova, size_t size,
struct iommu_iotlb_gather *iotlb_gather)
@@ -2503,7 +2583,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
unsigned long orig_iova = iova;
unsigned int min_pagesz;
- if (unlikely(ops->unmap == NULL ||
+ if (unlikely(!(ops->unmap || ops->unmap_pages) ||
domain->pgsize_bitmap == 0UL))
return 0;
@@ -2531,9 +2611,9 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
* or we hit an area that isn't mapped.
*/
while (unmapped < size) {
- size_t pgsize = iommu_pgsize(domain, iova, size - unmapped);
-
- unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather);
+ unmapped_page = __iommu_unmap_pages(domain, iova,
+ size - unmapped,
+ iotlb_gather);
if (!unmapped_page)
break;
@@ -3128,6 +3208,14 @@ static int iommu_change_dev_def_domain(struct iommu_group *group,
goto out;
}
+ /* We can bring up a flush queue without tearing down the domain */
+ if (type == IOMMU_DOMAIN_DMA_FQ && prev_dom->type == IOMMU_DOMAIN_DMA) {
+ ret = iommu_dma_init_fq(prev_dom);
+ if (!ret)
+ prev_dom->type = IOMMU_DOMAIN_DMA_FQ;
+ goto out;
+ }
+
/* Sets group->default_domain to the newly allocated domain */
ret = iommu_group_alloc_default_domain(dev->bus, group, type);
if (ret)
@@ -3168,9 +3256,9 @@ out:
}
/*
- * Changing the default domain through sysfs requires the users to ubind the
- * drivers from the devices in the iommu group. Return failure if this doesn't
- * meet.
+ * Changing the default domain through sysfs requires the users to unbind the
+ * drivers from the devices in the iommu group, except for a DMA -> DMA-FQ
+ * transition. Return failure if this isn't met.
*
* We need to consider the race between this and the device release path.
* device_lock(dev) is used here to guarantee that the device release path
@@ -3193,6 +3281,8 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
req_type = IOMMU_DOMAIN_IDENTITY;
else if (sysfs_streq(buf, "DMA"))
req_type = IOMMU_DOMAIN_DMA;
+ else if (sysfs_streq(buf, "DMA-FQ"))
+ req_type = IOMMU_DOMAIN_DMA_FQ;
else if (sysfs_streq(buf, "auto"))
req_type = 0;
else
@@ -3244,7 +3334,8 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
/* Check if the device in the group still has a driver bound to it */
device_lock(dev);
- if (device_is_bound(dev)) {
+ if (device_is_bound(dev) && !(req_type == IOMMU_DOMAIN_DMA_FQ &&
+ group->default_domain->type == IOMMU_DOMAIN_DMA)) {
pr_err_ratelimited("Device is still bound to driver\n");
ret = -EBUSY;
goto out;
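
iommu_pgsize() now reports not only the largest usable page size but also how many pages of that size the caller can push down in one ->map_pages()/->unmap_pages() call. Ignoring the refinement that trims the count at the next bigger size boundary, the core selection amounts to something like the following standalone sketch, which is not the kernel function itself:

#include <stddef.h>

/*
 * Pick the largest page size that is supported by the hardware bitmap,
 * no larger than the request, and compatible with the alignment of both
 * addresses; report how many such pages fit in the request.
 */
static size_t pick_pgsize(unsigned long pgsize_bitmap, unsigned long iova,
                          unsigned long paddr, size_t size, size_t *count)
{
        unsigned long addr_merge = iova | paddr;
        size_t pgsize = 0;

        for (int bit = 8 * sizeof(unsigned long) - 1; bit >= 0; bit--) {
                size_t candidate = 1UL << bit;

                if (!(pgsize_bitmap & candidate))
                        continue;               /* not supported */
                if (candidate > size)
                        continue;               /* does not fit */
                if (addr_merge & (candidate - 1))
                        continue;               /* misaligned for this size */
                pgsize = candidate;
                break;
        }

        *count = pgsize ? size / pgsize : 0;
        return pgsize;
}
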
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index b6cf5f16123b..0af42fb93a49 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -121,8 +121,6 @@ int init_iova_flush_queue(struct iova_domain *iovad,
spin_lock_init(&fq->lock);
}
- smp_wmb();
-
iovad->fq = queue;
timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
@@ -633,10 +631,20 @@ void queue_iova(struct iova_domain *iovad,
unsigned long pfn, unsigned long pages,
unsigned long data)
{
- struct iova_fq *fq = raw_cpu_ptr(iovad->fq);
+ struct iova_fq *fq;
unsigned long flags;
unsigned idx;
+ /*
+ * Order against the IOMMU driver's pagetable update from unmapping
+ * @pte, to guarantee that iova_domain_flush() observes that if called
+ * from a different CPU before we release the lock below. Full barrier
+ * so it also pairs with iommu_dma_init_fq() to avoid seeing partially
+ * written fq state here.
+ */
+ smp_mb();
+
+ fq = raw_cpu_ptr(iovad->fq);
spin_lock_irqsave(&fq->lock, flags);
/*
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index 51ea6f00db2f..d38ff29a76e8 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -8,7 +8,6 @@
#include <linux/bitmap.h>
#include <linux/delay.h>
-#include <linux/dma-iommu.h>
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/export.h>
@@ -564,10 +563,13 @@ static irqreturn_t ipmmu_irq(int irq, void *dev)
* IOMMU Operations
*/
-static struct iommu_domain *__ipmmu_domain_alloc(unsigned type)
+static struct iommu_domain *ipmmu_domain_alloc(unsigned type)
{
struct ipmmu_vmsa_domain *domain;
+ if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
+ return NULL;
+
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
return NULL;
@@ -577,27 +579,6 @@ static struct iommu_domain *__ipmmu_domain_alloc(unsigned type)
return &domain->io_domain;
}
-static struct iommu_domain *ipmmu_domain_alloc(unsigned type)
-{
- struct iommu_domain *io_domain = NULL;
-
- switch (type) {
- case IOMMU_DOMAIN_UNMANAGED:
- io_domain = __ipmmu_domain_alloc(type);
- break;
-
- case IOMMU_DOMAIN_DMA:
- io_domain = __ipmmu_domain_alloc(type);
- if (io_domain && iommu_get_dma_cookie(io_domain)) {
- kfree(io_domain);
- io_domain = NULL;
- }
- break;
- }
-
- return io_domain;
-}
-
static void ipmmu_domain_free(struct iommu_domain *io_domain)
{
struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
@@ -606,7 +587,6 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain)
* Free the domain resources. We assume that all devices have already
* been detached.
*/
- iommu_put_dma_cookie(io_domain);
ipmmu_domain_destroy_context(domain);
free_io_pgtable_ops(domain->iop);
kfree(domain);
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 6f7c69688ce2..d837adfd1da5 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -9,7 +9,6 @@
#include <linux/component.h>
#include <linux/device.h>
#include <linux/dma-direct.h>
-#include <linux/dma-iommu.h>
#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/io.h>
@@ -441,17 +440,11 @@ static struct iommu_domain *mtk_iommu_domain_alloc(unsigned type)
if (!dom)
return NULL;
- if (iommu_get_dma_cookie(&dom->domain)) {
- kfree(dom);
- return NULL;
- }
-
return &dom->domain;
}
static void mtk_iommu_domain_free(struct iommu_domain *domain)
{
- iommu_put_dma_cookie(domain);
kfree(to_mtk_domain(domain));
}
@@ -520,12 +513,8 @@ static size_t mtk_iommu_unmap(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
{
struct mtk_iommu_domain *dom = to_mtk_domain(domain);
- unsigned long end = iova + size - 1;
- if (gather->start > iova)
- gather->start = iova;
- if (gather->end < end)
- gather->end = end;
+ iommu_iotlb_gather_add_range(gather, iova, size);
return dom->iop->unmap(dom->iop, iova, size, gather);
}
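
The open-coded start/end bookkeeping is replaced by the generic iommu_iotlb_gather_add_range() helper, which performs the same range merge on the gather structure. Its effect is essentially the following simplified sketch, not the exact header definition:

#include <stddef.h>

struct gather_range { unsigned long start, end; };

static void gather_add_range(struct gather_range *g,
                             unsigned long iova, size_t size)
{
        unsigned long end = iova + size - 1;

        if (g->start > iova)
                g->start = iova;
        if (g->end < end)
                g->end = end;
}
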
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index 778e66f5f1aa..be22fcf988ce 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -13,7 +13,6 @@
#include <linux/component.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
-#include <linux/dma-iommu.h>
#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/io.h>
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index 9febfb7f3025..5cb260820eda 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -10,7 +10,6 @@
#include <linux/compiler.h>
#include <linux/delay.h>
#include <linux/device.h>
-#include <linux/dma-iommu.h>
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
@@ -1074,10 +1073,6 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
if (!rk_domain)
return NULL;
- if (type == IOMMU_DOMAIN_DMA &&
- iommu_get_dma_cookie(&rk_domain->domain))
- goto err_free_domain;
-
/*
* rk32xx iommus use a 2 level pagetable.
* Each level1 (dt) and level2 (pt) table has 1024 4-byte entries.
@@ -1085,7 +1080,7 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
*/
rk_domain->dt = (u32 *)get_zeroed_page(GFP_KERNEL | GFP_DMA32);
if (!rk_domain->dt)
- goto err_put_cookie;
+ goto err_free_domain;
rk_domain->dt_dma = dma_map_single(dma_dev, rk_domain->dt,
SPAGE_SIZE, DMA_TO_DEVICE);
@@ -1106,9 +1101,6 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
err_free_dt:
free_page((unsigned long)rk_domain->dt);
-err_put_cookie:
- if (type == IOMMU_DOMAIN_DMA)
- iommu_put_dma_cookie(&rk_domain->domain);
err_free_domain:
kfree(rk_domain);
@@ -1137,8 +1129,6 @@ static void rk_iommu_domain_free(struct iommu_domain *domain)
SPAGE_SIZE, DMA_TO_DEVICE);
free_page((unsigned long)rk_domain->dt);
- if (domain->type == IOMMU_DOMAIN_DMA)
- iommu_put_dma_cookie(&rk_domain->domain);
kfree(rk_domain);
}
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index 73dfd9946312..27ac818b0354 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -8,7 +8,6 @@
#include <linux/clk.h>
#include <linux/device.h>
-#include <linux/dma-iommu.h>
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/iommu.h>
@@ -144,11 +143,6 @@ static struct iommu_domain *sprd_iommu_domain_alloc(unsigned int domain_type)
if (!dom)
return NULL;
- if (iommu_get_dma_cookie(&dom->domain)) {
- kfree(dom);
- return NULL;
- }
-
spin_lock_init(&dom->pgtlock);
dom->domain.geometry.aperture_start = 0;
@@ -161,7 +155,6 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain)
{
struct sprd_iommu_domain *dom = to_sprd_domain(domain);
- iommu_put_dma_cookie(domain);
kfree(dom);
}
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index 181bb1c3437c..92997021e188 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -7,7 +7,6 @@
#include <linux/clk.h>
#include <linux/device.h>
#include <linux/dma-direction.h>
-#include <linux/dma-iommu.h>
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/errno.h>
@@ -610,14 +609,10 @@ static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type)
if (!sun50i_domain)
return NULL;
- if (type == IOMMU_DOMAIN_DMA &&
- iommu_get_dma_cookie(&sun50i_domain->domain))
- goto err_free_domain;
-
sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(DT_SIZE));
if (!sun50i_domain->dt)
- goto err_put_cookie;
+ goto err_free_domain;
refcount_set(&sun50i_domain->refcnt, 1);
@@ -627,10 +622,6 @@ static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type)
return &sun50i_domain->domain;
-err_put_cookie:
- if (type == IOMMU_DOMAIN_DMA)
- iommu_put_dma_cookie(&sun50i_domain->domain);
-
err_free_domain:
kfree(sun50i_domain);
@@ -644,8 +635,6 @@ static void sun50i_iommu_domain_free(struct iommu_domain *domain)
free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE));
sun50i_domain->dt = NULL;
- iommu_put_dma_cookie(domain);
-
kfree(sun50i_domain);
}
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 6abdcab7273b..80930ce04a16 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -598,12 +598,6 @@ static struct iommu_domain *viommu_domain_alloc(unsigned type)
spin_lock_init(&vdomain->mappings_lock);
vdomain->mappings = RB_ROOT_CACHED;
- if (type == IOMMU_DOMAIN_DMA &&
- iommu_get_dma_cookie(&vdomain->domain)) {
- kfree(vdomain);
- return NULL;
- }
-
return &vdomain->domain;
}
@@ -643,8 +637,6 @@ static void viommu_domain_free(struct iommu_domain *domain)
{
struct viommu_domain *vdomain = to_viommu_domain(domain);
- iommu_put_dma_cookie(domain);
-
/* Free all remaining mappings (size 2^64) */
viommu_del_mappings(vdomain, 0, 0);
diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c
index cb1a64a5c256..80a2c270d502 100644
--- a/drivers/mmc/host/jz4740_mmc.c
+++ b/drivers/mmc/host/jz4740_mmc.c
@@ -578,10 +578,6 @@ static bool jz4740_mmc_read_data(struct jz4740_mmc_host *host,
}
}
data->bytes_xfered += miter->length;
-
- /* This can go away once MIPS implements
- * flush_kernel_dcache_page */
- flush_dcache_page(miter->page);
}
sg_miter_stop(miter);
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index a1bcde3395a6..f4c8e1a61f53 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -941,7 +941,7 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd,
/* discard mappings */
if (direction == DMA_FROM_DEVICE)
- flush_kernel_dcache_page(sg_page(sg));
+ flush_dcache_page(sg_page(sg));
kunmap(sg_page(sg));
if (dma_dev)
dma_unmap_page(dma_dev, dma_addr, PAGE_SIZE, dir);
diff --git a/drivers/of/device.c b/drivers/of/device.c
index c5a9473a5fb1..5b043ee30824 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -5,6 +5,7 @@
#include <linux/of_device.h>
#include <linux/of_address.h>
#include <linux/of_iommu.h>
+#include <linux/of_reserved_mem.h>
#include <linux/dma-direct.h> /* for bus_dma_region */
#include <linux/dma-map-ops.h>
#include <linux/init.h>
@@ -52,6 +53,42 @@ int of_device_add(struct platform_device *ofdev)
return device_add(&ofdev->dev);
}
+static void
+of_dma_set_restricted_buffer(struct device *dev, struct device_node *np)
+{
+ struct device_node *node, *of_node = dev->of_node;
+ int count, i;
+
+ if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL))
+ return;
+
+ count = of_property_count_elems_of_size(of_node, "memory-region",
+ sizeof(u32));
+ /*
+ * If dev->of_node doesn't exist or doesn't contain memory-region, try
+ * the OF node having DMA configuration.
+ */
+ if (count <= 0) {
+ of_node = np;
+ count = of_property_count_elems_of_size(
+ of_node, "memory-region", sizeof(u32));
+ }
+
+ for (i = 0; i < count; i++) {
+ node = of_parse_phandle(of_node, "memory-region", i);
+ /*
+ * There might be multiple memory regions, but only one
+ * restricted-dma-pool region is allowed.
+ */
+ if (of_device_is_compatible(node, "restricted-dma-pool") &&
+ of_device_is_available(node))
+ break;
+ }
+
+ if (i != count && of_reserved_mem_device_init_by_idx(dev, of_node, i))
+ dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n");
+}
+
/**
* of_dma_configure_id - Setup DMA configuration
* @dev: Device to apply DMA configuration
@@ -165,6 +202,9 @@ int of_dma_configure_id(struct device *dev, struct device_node *np,
arch_setup_dma_ops(dev, dma_start, size, iommu, coherent);
+ if (!iommu)
+ of_dma_set_restricted_buffer(dev, np);
+
return 0;
}
EXPORT_SYMBOL_GPL(of_dma_configure_id);
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index fd3964d24224..59c1390cdf42 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -33,18 +33,22 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
phys_addr_t *res_base)
{
phys_addr_t base;
+ int err = 0;
end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end;
align = !align ? SMP_CACHE_BYTES : align;
- base = memblock_find_in_range(start, end, size, align);
+ base = memblock_phys_alloc_range(size, align, start, end);
if (!base)
return -ENOMEM;
*res_base = base;
- if (nomap)
- return memblock_mark_nomap(base, size);
+ if (nomap) {
+ err = memblock_mark_nomap(base, size);
+ if (err)
+ memblock_free(base, size);
+ }
- return memblock_reserve(base, size);
+ return err;
}
/*
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 427041c1e408..2156c632524d 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -699,7 +699,7 @@ static int pcifront_connect_and_init_dma(struct pcifront_device *pdev)
spin_unlock(&pcifront_dev_lock);
- if (!err && !is_swiotlb_active()) {
+ if (!err && !is_swiotlb_active(&pdev->xdev->dev)) {
err = pci_xen_swiotlb_init_late();
if (err)
dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n");
diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig
index 7dd35f1b9cc5..82b63e60c5a2 100644
--- a/drivers/phy/Kconfig
+++ b/drivers/phy/Kconfig
@@ -37,7 +37,7 @@ config PHY_LPC18XX_USB_OTG
config PHY_PISTACHIO_USB
tristate "IMG Pistachio USB2.0 PHY driver"
- depends on MACH_PISTACHIO
+ depends on MIPS || COMPILE_TEST
select GENERIC_PHY
help
Enable this to support the USB2.0 PHY on the IMG Pistachio SoC.
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 85d58b720a24..643fe440c46e 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -100,7 +100,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr)
* in our domain. Therefore _only_ check address within our domain.
*/
if (pfn_valid(PFN_DOWN(paddr)))
- return is_swiotlb_buffer(paddr);
+ return is_swiotlb_buffer(dev, paddr);
return 0;
}
@@ -164,7 +164,7 @@ int __ref xen_swiotlb_init(void)
int rc = -ENOMEM;
char *start;
- if (io_tlb_default_mem != NULL) {
+ if (io_tlb_default_mem.nslabs) {
pr_warn("swiotlb buffer already initialized\n");
return -EEXIST;
}
@@ -374,7 +374,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
if (dma_capable(dev, dev_addr, size, true) &&
!range_straddles_page_boundary(phys, size) &&
!xen_arch_need_swiotlb(dev, phys, dev_addr) &&
- swiotlb_force != SWIOTLB_FORCE)
+ !is_swiotlb_force_bounce(dev))
goto done;
/*
@@ -547,7 +547,7 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
static int
xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
- return xen_phys_to_dma(hwdev, io_tlb_default_mem->end - 1) <= mask;
+ return xen_phys_to_dma(hwdev, io_tlb_default_mem.end - 1) <= mask;
}
const struct dma_map_ops xen_swiotlb_dma_ops = {
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index f00fcc4a4f72..e619c31b6bd9 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -3,6 +3,7 @@
* Implement the manual drop-all-pagecache function
*/
+#include <linux/pagemap.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -27,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
* we need to reschedule to avoid softlockups.
*/
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- (inode->i_mapping->nrpages == 0 && !need_resched())) {
+ (mapping_empty(inode->i_mapping) && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/exec.c b/fs/exec.c
index 3b78b22addfb..2dc489c164fe 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -217,8 +217,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* We are doing an exec(). 'current' is the process
* doing the exec and bprm->mm is the new process's mm.
*/
+ mmap_read_lock(bprm->mm);
ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
&page, NULL, NULL);
+ mmap_read_unlock(bprm->mm);
if (ret <= 0)
return NULL;
@@ -574,7 +576,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
}
if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
+ flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
@@ -592,7 +594,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
ret = 0;
out:
if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
+ flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
@@ -634,7 +636,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
kaddr = kmap_atomic(page);
flush_arg_page(bprm, pos & PAGE_MASK, page);
memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
- flush_kernel_dcache_page(page);
+ flush_dcache_page(page);
kunmap_atomic(kaddr);
put_arg_page(page);
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 68added37c15..9c6c6a3e2de5 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1051,7 +1051,8 @@ static int __init fcntl_init(void)
__FMODE_EXEC | __FMODE_NONOTIFY));
fasync_cache = kmem_cache_create("fasync_cache",
- sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+ sizeof(struct fasync_struct), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index eb57dade6076..81ec192ce067 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -406,6 +406,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
inc_wb_stat(new_wb, WB_WRITEBACK);
}
+ if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+ atomic_dec(&old_wb->writeback_inodes);
+ atomic_inc(&new_wb->writeback_inodes);
+ }
+
wb_get(new_wb);
/*
@@ -1034,20 +1039,20 @@ restart:
* cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
* @bdi_id: target bdi id
* @memcg_id: target memcg css id
- * @nr: number of pages to write, 0 for best-effort dirty flushing
* @reason: reason why some writeback work initiated
* @done: target wb_completion
*
* Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
* with the specified parameters.
*/
-int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
enum wb_reason reason, struct wb_completion *done)
{
struct backing_dev_info *bdi;
struct cgroup_subsys_state *memcg_css;
struct bdi_writeback *wb;
struct wb_writeback_work *work;
+ unsigned long dirty;
int ret;
/* lookup bdi and memcg */
@@ -1076,24 +1081,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
}
/*
- * If @nr is zero, the caller is attempting to write out most of
+ * The caller is attempting to write out most of
* the currently dirty pages. Let's take the current dirty page
* count and inflate it by 25% which should be large enough to
* flush out most dirty pages while avoiding getting livelocked by
* concurrent dirtiers.
+ *
+ * BTW the memcg stats are flushed periodically and this is best-effort
+ * estimation, so some potential error is ok.
*/
- if (!nr) {
- unsigned long filepages, headroom, dirty, writeback;
-
- mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
- &writeback);
- nr = dirty * 10 / 8;
- }
+ dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
+ dirty = dirty * 10 / 8;
/* issue the writeback work */
work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
if (work) {
- work->nr_pages = nr;
+ work->nr_pages = dirty;
work->sync_mode = WB_SYNC_NONE;
work->range_cyclic = 1;
work->reason = reason;
@@ -1999,7 +2002,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long dirtied_before = jiffies;
struct inode *inode;
@@ -2053,8 +2055,6 @@ static long wb_writeback(struct bdi_writeback *wb,
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb, work);
- wb_update_bandwidth(wb, wb_start);
-
/*
* Did we write something? Try for more
*
diff --git a/fs/fs_context.c b/fs/fs_context.c
index de1985eae535..b7e43a780a62 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -254,7 +254,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
struct fs_context *fc;
int ret = -ENOMEM;
- fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT);
if (!fc)
return ERR_PTR(-ENOMEM);
@@ -649,7 +649,7 @@ const struct fs_context_operations legacy_fs_context_ops = {
*/
static int legacy_init_fs_context(struct fs_context *fc)
{
- fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+ fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
if (!fc->fs_private)
return -ENOMEM;
fc->ops = &legacy_fs_context_ops;
diff --git a/fs/inode.c b/fs/inode.c
index 84c528cd1955..37710ca863b5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -770,7 +770,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
return LRU_ROTATE;
}
- if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(lru_lock);
diff --git a/fs/locks.c b/fs/locks.c
index 3d6fb4ae847b..51a5b72ef302 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2941,10 +2941,12 @@ static int __init filelock_init(void)
int i;
flctx_cache = kmem_cache_create("file_lock_ctx",
- sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
+ sizeof(struct file_lock_context), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
filelock_cache = kmem_cache_create("file_lock_cache",
- sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+ sizeof(struct file_lock), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
for_each_possible_cpu(i) {
struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
diff --git a/fs/namei.c b/fs/namei.c
index d049d3972695..95a881e0552b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4089,7 +4089,9 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
return -EPERM;
inode_lock(target);
- if (is_local_mountpoint(dentry))
+ if (IS_SWAPFILE(target))
+ error = -EPERM;
+ else if (is_local_mountpoint(dentry))
error = -EBUSY;
else {
error = security_inode_unlink(dir, dentry);
@@ -4597,6 +4599,10 @@ int vfs_rename(struct renamedata *rd)
else if (target)
inode_lock(target);
+ error = -EPERM;
+ if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
+ goto out;
+
error = -EBUSY;
if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
goto out;
diff --git a/fs/namespace.c b/fs/namespace.c
index 12852363d90e..659a8f39c61a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -203,7 +203,8 @@ static struct mount *alloc_vfsmnt(const char *name)
goto out_free_cache;
if (name) {
- mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
+ mnt->mnt_devname = kstrdup_const(name,
+ GFP_KERNEL_ACCOUNT);
if (!mnt->mnt_devname)
goto out_free_id;
}
@@ -3370,7 +3371,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!ucounts)
return ERR_PTR(-ENOSPC);
- new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns) {
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
@@ -4306,7 +4307,7 @@ void __init mnt_init(void)
int err;
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
mount_hashtable = alloc_large_system_hash("Mount-cache",
sizeof(struct hlist_head),
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 48fd369c29a4..359524b7341f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -16,6 +16,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/delay.h>
#include <linux/quotaops.h>
#include <linux/sched/signal.h>
@@ -2721,7 +2722,7 @@ int ocfs2_inode_lock_tracker(struct inode *inode,
return status;
}
}
- return tmp_oh ? 1 : 0;
+ return 1;
}
void ocfs2_inode_unlock_tracker(struct inode *inode,
@@ -3912,6 +3913,17 @@ downconvert:
spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
gen);
+ /* The dlm lock convert is being cancelled in the background;
+ * ocfs2_cancel_convert() is asynchronous in fs/dlm, so requeue
+ * it and try again later.
+ */
+ if (ret == -EBUSY) {
+ ctl->requeue = 1;
+ mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n",
+ lockres->l_name);
+ ret = 0;
+ msleep(20);
+ }
leave:
if (ret)
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index eda83487c9ec..f033de733adb 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -357,7 +357,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
}
oinfo->dqi_gi.dqi_sb = sb;
oinfo->dqi_gi.dqi_type = type;
- ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
oinfo->dqi_gqi_bh = NULL;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index b1a8b046f4c2..0e4b16d4c037 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -702,6 +702,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
info->dqi_priv = oinfo;
oinfo->dqi_type = type;
INIT_LIST_HEAD(&oinfo->dqi_chunk);
+ oinfo->dqi_gqinode = NULL;
+ ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_rec = NULL;
oinfo->dqi_lqi_bh = NULL;
oinfo->dqi_libh = NULL;
diff --git a/fs/pipe.c b/fs/pipe.c
index 6d4342bad9f1..1fa1f52763f0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal);
*/
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- return try_get_page(buf->page);
+ return try_get_compound_head(buf->page, 1);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
diff --git a/fs/select.c b/fs/select.c
index 945896d0ac9e..e83e563a351d 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -655,7 +655,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
goto out_nofds;
alloc_size = 6 * size;
- bits = kvmalloc(alloc_size, GFP_KERNEL);
+ bits = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT);
if (!bits)
goto out_nofds;
}
@@ -1000,7 +1000,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!walk) {
err = -ENOMEM;
goto out_fds;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5c2d806e6ae5..003f0d31743e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly;
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
-enum userfaultfd_state {
- UFFD_STATE_WAIT_API,
- UFFD_STATE_RUNNING,
-};
-
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
* to be in the same cacheline.
@@ -69,12 +64,10 @@ struct userfaultfd_ctx {
unsigned int flags;
/* features requested from the userspace */
unsigned int features;
- /* state machine */
- enum userfaultfd_state state;
/* released */
bool released;
/* memory mappings are changing because of non-cooperative event */
- bool mmap_changing;
+ atomic_t mmap_changing;
/* mm with one or more vmas attached to this userfaultfd_ctx */
struct mm_struct *mm;
};
@@ -104,6 +97,14 @@ struct userfaultfd_wake_range {
unsigned long len;
};
+/* internal indication that UFFD_API ioctl was successfully executed */
+#define UFFD_FEATURE_INITIALIZED (1u << 31)
+
+static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
+{
+ return ctx->features & UFFD_FEATURE_INITIALIZED;
+}
+
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -623,7 +624,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
* already released.
*/
out:
- WRITE_ONCE(ctx->mmap_changing, false);
+ atomic_dec(&ctx->mmap_changing);
+ VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
userfaultfd_ctx_put(ctx);
}
@@ -666,15 +668,14 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
refcount_set(&ctx->refcount, 1);
ctx->flags = octx->flags;
- ctx->state = UFFD_STATE_RUNNING;
ctx->features = octx->features;
ctx->released = false;
- ctx->mmap_changing = false;
+ atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
- WRITE_ONCE(octx->mmap_changing, true);
+ atomic_inc(&octx->mmap_changing);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -721,7 +722,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
- WRITE_ONCE(ctx->mmap_changing, true);
+ atomic_inc(&ctx->mmap_changing);
} else {
/* Drop uffd context if remap feature not enabled */
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -766,7 +767,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
- WRITE_ONCE(ctx->mmap_changing, true);
+ atomic_inc(&ctx->mmap_changing);
mmap_read_unlock(mm);
msg_init(&ewq.msg);
@@ -810,7 +811,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
- WRITE_ONCE(ctx->mmap_changing, true);
+ atomic_inc(&ctx->mmap_changing);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -943,38 +944,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
poll_wait(file, &ctx->fd_wqh, wait);
- switch (ctx->state) {
- case UFFD_STATE_WAIT_API:
+ if (!userfaultfd_is_initialized(ctx))
return EPOLLERR;
- case UFFD_STATE_RUNNING:
- /*
- * poll() never guarantees that read won't block.
- * userfaults can be waken before they're read().
- */
- if (unlikely(!(file->f_flags & O_NONBLOCK)))
- return EPOLLERR;
- /*
- * lockless access to see if there are pending faults
- * __pollwait last action is the add_wait_queue but
- * the spin_unlock would allow the waitqueue_active to
- * pass above the actual list_add inside
- * add_wait_queue critical section. So use a full
- * memory barrier to serialize the list_add write of
- * add_wait_queue() with the waitqueue_active read
- * below.
- */
- ret = 0;
- smp_mb();
- if (waitqueue_active(&ctx->fault_pending_wqh))
- ret = EPOLLIN;
- else if (waitqueue_active(&ctx->event_wqh))
- ret = EPOLLIN;
- return ret;
- default:
- WARN_ON_ONCE(1);
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be waken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
return EPOLLERR;
- }
+ /*
+ * lockless access to see if there are pending faults
+ * __pollwait last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = EPOLLIN;
+ else if (waitqueue_active(&ctx->event_wqh))
+ ret = EPOLLIN;
+
+ return ret;
}
static const struct file_operations userfaultfd_fops;
@@ -1169,7 +1165,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
int no_wait = file->f_flags & O_NONBLOCK;
struct inode *inode = file_inode(file);
- if (ctx->state == UFFD_STATE_WAIT_API)
+ if (!userfaultfd_is_initialized(ctx))
return -EINVAL;
for (;;) {
@@ -1700,7 +1696,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
user_uffdio_copy = (struct uffdio_copy __user *) arg;
ret = -EAGAIN;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out;
ret = -EFAULT;
@@ -1757,7 +1753,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
ret = -EAGAIN;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out;
ret = -EFAULT;
@@ -1807,7 +1803,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range range;
bool mode_wp, mode_dontwake;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
return -EAGAIN;
user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
@@ -1855,7 +1851,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
user_uffdio_continue = (struct uffdio_continue __user *)arg;
ret = -EAGAIN;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out;
ret = -EFAULT;
@@ -1908,9 +1904,10 @@ out:
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
- * For the current set of features the bits just coincide
+ * For the current set of features the bits just coincide. Set
+ * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
*/
- return (unsigned int)user_features;
+ return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}
/*
@@ -1923,12 +1920,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
{
struct uffdio_api uffdio_api;
void __user *buf = (void __user *)arg;
+ unsigned int ctx_features;
int ret;
__u64 features;
- ret = -EINVAL;
- if (ctx->state != UFFD_STATE_WAIT_API)
- goto out;
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
@@ -1952,9 +1947,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
goto out;
- ctx->state = UFFD_STATE_RUNNING;
+
/* only enable the requested features for this uffd context */
- ctx->features = uffd_ctx_features(features);
+ ctx_features = uffd_ctx_features(features);
+ ret = -EINVAL;
+ if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
+ goto err_out;
+
ret = 0;
out:
return ret;
@@ -1971,7 +1970,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
int ret = -EINVAL;
struct userfaultfd_ctx *ctx = file->private_data;
- if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+ if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
return -EINVAL;
switch(cmd) {
@@ -2085,9 +2084,8 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
refcount_set(&ctx->refcount, 1);
ctx->flags = flags;
ctx->features = 0;
- ctx->state = UFFD_STATE_WAIT_API;
ctx->released = false;
- ctx->mmap_changing = false;
+ atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
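The userfaultfd rework above drops the WAIT_API/RUNNING state machine in favour of a UFFD_FEATURE_INITIALIZED bit set atomically by the first successful UFFDIO_API, and turns mmap_changing into an atomic counter so concurrent non-cooperative events nest correctly. The userspace contract is unchanged; a minimal sketch exercising it (assumes unprivileged userfaultfd is allowed via the vm.unprivileged_userfaultfd sysctl or suitable privilege):

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };

	if (uffd < 0) {
		perror("userfaultfd");
		return 1;
	}

	/* Any other ioctl issued before UFFDIO_API is rejected with EINVAL. */
	if (ioctl(uffd, UFFDIO_API, &api))	/* initializes the context */
		perror("UFFDIO_API");

	/*
	 * Repeating UFFDIO_API on an initialized context also fails with
	 * EINVAL; after this patch that is detected via the cmpxchg() on
	 * ctx->features rather than the removed state field.
	 */
	if (ioctl(uffd, UFFDIO_API, &api) == 0)
		fprintf(stderr, "unexpected: second UFFDIO_API succeeded\n");

	close(uffd);
	return 0;
}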
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 1d7edad9914f..33207004cfde 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -116,6 +116,7 @@ struct bdi_writeback {
struct list_head b_dirty_time; /* time stamps are dirty */
spinlock_t list_lock; /* protects the b_* lists */
+ atomic_t writeback_inodes; /* number of inodes under writeback */
struct percpu_counter stat[NR_WB_STAT_ITEMS];
unsigned long congested; /* WB_[a]sync_congested flags */
@@ -142,6 +143,7 @@ struct bdi_writeback {
spinlock_t work_lock; /* protects work_list & dwork scheduling */
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */
+ struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
unsigned long dirty_sleep; /* last wait */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 29530859d9ff..ac7f231b8825 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -288,6 +288,17 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
return inode->i_wb;
}
+static inline struct bdi_writeback *inode_to_wb_wbc(
+ struct inode *inode,
+ struct writeback_control *wbc)
+{
+ /*
+ * If wbc does not have inode attached, it means cgroup writeback was
+ * disabled when wbc started. Just use the default wb in that case.
+ */
+ return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb;
+}
+
/**
* unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
* @inode: target inode
@@ -366,6 +377,14 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
return &inode_to_bdi(inode)->wb;
}
+static inline struct bdi_writeback *inode_to_wb_wbc(
+ struct inode *inode,
+ struct writeback_control *wbc)
+{
+ return inode_to_wb(inode);
+}
+
+
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index e7e99da31349..6486d3c19463 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -409,7 +409,7 @@ static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
static inline void invalidate_bh_lrus_cpu(int cpu) {}
-static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; }
+static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
#define buffer_heads_over_limit 0
#endif /* CONFIG_BLOCK */
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index c24098c7acca..34bce35c808d 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order)
extern unsigned int sysctl_compaction_proactiveness;
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos);
+extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table,
+ int write, void *buffer, size_t *length, loff_t *ppos);
extern int sysctl_extfrag_threshold;
extern int sysctl_compact_unevictable_allowed;
diff --git a/include/linux/device.h b/include/linux/device.h
index 65d84b67b024..e270cb740b9e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -424,6 +424,7 @@ struct dev_links_info {
* @dma_pools: Dma pools (if dma'ble device).
* @dma_mem: Internal for coherent mem override.
* @cma_area: Contiguous memory area for dma allocations
+ * @dma_io_tlb_mem: Pointer to the swiotlb pool used. Not for driver use.
* @archdata: For arch-specific additions.
* @of_node: Associated device tree node.
* @fwnode: Associated device node supplied by platform firmware.
@@ -534,6 +535,9 @@ struct device {
struct cma *cma_area; /* contiguous memory area for dma
allocations */
#endif
+#ifdef CONFIG_SWIOTLB
+ struct io_tlb_mem *dma_io_tlb_mem;
+#endif
/* arch specific additions */
struct dev_archdata archdata;
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 758ca4694257..24607dc3c2ac 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -20,6 +20,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain);
/* Setup call for arch DMA mapping code */
void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit);
+int iommu_dma_init_fq(struct iommu_domain *domain);
/* The DMA API isn't _quite_ the whole story, though... */
/*
@@ -54,6 +55,11 @@ static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base,
{
}
+static inline int iommu_dma_init_fq(struct iommu_domain *domain)
+{
+ return -EINVAL;
+}
+
static inline int iommu_get_dma_cookie(struct iommu_domain *domain)
{
return -ENODEV;
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index d9a606a9fc64..b4c49f9cc379 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -130,10 +130,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page
}
#endif
-#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-static inline void flush_kernel_dcache_page(struct page *page)
-{
-}
+#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
static inline void flush_kernel_vmap_range(void *vaddr, int size)
{
}
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 0b8d1fdda3a1..c137396129db 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -121,6 +121,13 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
css_put(&h_cg->css);
}
+static inline void resv_map_dup_hugetlb_cgroup_uncharge_info(
+ struct resv_map *resv_map)
+{
+ if (resv_map->css)
+ css_get(resv_map->css);
+}
+
extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup **ptr);
extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
@@ -199,6 +206,11 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
{
}
+static inline void resv_map_dup_hugetlb_cgroup_uncharge_info(
+ struct resv_map *resv_map)
+{
+}
+
static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup **ptr)
{
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index d0fa0b31994d..05a65eb155f7 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -124,9 +124,9 @@
#define DMAR_MTRR_PHYSMASK8_REG 0x208
#define DMAR_MTRR_PHYSBASE9_REG 0x210
#define DMAR_MTRR_PHYSMASK9_REG 0x218
-#define DMAR_VCCAP_REG 0xe00 /* Virtual command capability register */
-#define DMAR_VCMD_REG 0xe10 /* Virtual command register */
-#define DMAR_VCRSP_REG 0xe20 /* Virtual command response register */
+#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */
+#define DMAR_VCMD_REG 0xe00 /* Virtual command register */
+#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */
#define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg)
#define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg)
diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 10fa80eef13a..57cceecbe37f 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -14,6 +14,11 @@
#define SVM_REQ_EXEC (1<<1)
#define SVM_REQ_PRIV (1<<0)
+/* Page Request Queue depth */
+#define PRQ_ORDER 2
+#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
+#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5)
+
/*
* The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only
* for access to kernel addresses. No IOTLB flushes are automatically done
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 4d40dfa75b55..86af6f0a00a2 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -16,6 +16,7 @@ enum io_pgtable_fmt {
ARM_V7S,
ARM_MALI_LPAE,
AMD_IOMMU_V1,
+ APPLE_DART,
IO_PGTABLE_NUM_FMTS,
};
@@ -73,10 +74,6 @@ struct io_pgtable_cfg {
* to support up to 35 bits PA where the bit32, bit33 and bit34 are
* encoded in the bit9, bit4 and bit5 of the PTE respectively.
*
- * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs
- * on unmap, for DMA domains using the flush queue mechanism for
- * delayed invalidation.
- *
* IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
* for use in the upper half of a split address space.
*
@@ -86,7 +83,6 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS BIT(1)
#define IO_PGTABLE_QUIRK_ARM_MTK_EXT BIT(3)
- #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4)
#define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5)
#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6)
unsigned long quirks;
@@ -136,6 +132,11 @@ struct io_pgtable_cfg {
u64 transtab;
u64 memattr;
} arm_mali_lpae_cfg;
+
+ struct {
+ u64 ttbr[4];
+ u32 n_ttbrs;
+ } apple_dart_cfg;
};
};
@@ -143,7 +144,9 @@ struct io_pgtable_cfg {
* struct io_pgtable_ops - Page table manipulation API for IOMMU drivers.
*
* @map: Map a physically contiguous memory region.
+ * @map_pages: Map a physically contiguous range of pages of the same size.
* @unmap: Unmap a physically contiguous memory region.
+ * @unmap_pages: Unmap a range of virtually contiguous pages of the same size.
* @iova_to_phys: Translate iova to physical address.
*
* These functions map directly onto the iommu_ops member functions with
@@ -152,8 +155,14 @@ struct io_pgtable_cfg {
struct io_pgtable_ops {
int (*map)(struct io_pgtable_ops *ops, unsigned long iova,
phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
+ int (*map_pages)(struct io_pgtable_ops *ops, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped);
size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
size_t size, struct iommu_iotlb_gather *gather);
+ size_t (*unmap_pages)(struct io_pgtable_ops *ops, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *gather);
phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
unsigned long iova);
};
@@ -246,5 +255,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns;
extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns;
extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns;
#endif /* __IO_PGTABLE_H */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9369458ba1bd..d2f3435e7d17 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -40,6 +40,7 @@ struct iommu_domain;
struct notifier_block;
struct iommu_sva;
struct iommu_fault_event;
+struct iommu_dma_cookie;
/* iommu fault flags */
#define IOMMU_FAULT_READ 0x0
@@ -60,6 +61,7 @@ struct iommu_domain_geometry {
#define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API
implementation */
#define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */
+#define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */
/*
* This are the possible domain-types
@@ -72,12 +74,17 @@ struct iommu_domain_geometry {
* IOMMU_DOMAIN_DMA - Internally used for DMA-API implementations.
* This flag allows IOMMU drivers to implement
* certain optimizations for these domains
+ * IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB
+ * invalidation.
*/
#define IOMMU_DOMAIN_BLOCKED (0U)
#define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT)
#define IOMMU_DOMAIN_UNMANAGED (__IOMMU_DOMAIN_PAGING)
#define IOMMU_DOMAIN_DMA (__IOMMU_DOMAIN_PAGING | \
__IOMMU_DOMAIN_DMA_API)
+#define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \
+ __IOMMU_DOMAIN_DMA_API | \
+ __IOMMU_DOMAIN_DMA_FQ)
struct iommu_domain {
unsigned type;
@@ -86,9 +93,14 @@ struct iommu_domain {
iommu_fault_handler_t handler;
void *handler_token;
struct iommu_domain_geometry geometry;
- void *iova_cookie;
+ struct iommu_dma_cookie *iova_cookie;
};
+static inline bool iommu_is_dma_domain(struct iommu_domain *domain)
+{
+ return domain->type & __IOMMU_DOMAIN_DMA_API;
+}
+
enum iommu_cap {
IOMMU_CAP_CACHE_COHERENCY, /* IOMMU can enforce cache coherent DMA
transactions */
@@ -160,16 +172,22 @@ enum iommu_dev_features {
* @start: IOVA representing the start of the range to be flushed
* @end: IOVA representing the end of the range to be flushed (inclusive)
* @pgsize: The interval at which to perform the flush
+ * @freelist: Removed pages to free after sync
+ * @queued: Indicates that the flush will be queued
*
* This structure is intended to be updated by multiple calls to the
* ->unmap() function in struct iommu_ops before eventually being passed
- * into ->iotlb_sync().
+ * into ->iotlb_sync(). Drivers can add pages to @freelist to be freed after
+ * ->iotlb_sync() or ->iotlb_flush_all() have cleared all cached references to
+ * them. @queued is set to indicate when ->iotlb_flush_all() will be called
+ * later instead of ->iotlb_sync(), so drivers may optimise accordingly.
*/
struct iommu_iotlb_gather {
unsigned long start;
unsigned long end;
size_t pgsize;
struct page *freelist;
+ bool queued;
};
/**
@@ -180,7 +198,10 @@ struct iommu_iotlb_gather {
* @attach_dev: attach device to an iommu domain
* @detach_dev: detach device from an iommu domain
* @map: map a physically contiguous memory region to an iommu domain
+ * @map_pages: map a physically contiguous set of pages of the same size to
+ * an iommu domain.
* @unmap: unmap a physically contiguous memory region from an iommu domain
+ * @unmap_pages: unmap a number of pages of the same size from an iommu domain
* @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain
* @iotlb_sync_map: Sync mappings created recently using @map to the hardware
* @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
@@ -229,8 +250,14 @@ struct iommu_ops {
void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
int (*map)(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
+ int (*map_pages)(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped);
size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
size_t size, struct iommu_iotlb_gather *iotlb_gather);
+ size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *iotlb_gather);
void (*flush_iotlb_all)(struct iommu_domain *domain);
void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova,
size_t size);
@@ -476,8 +503,7 @@ int iommu_enable_nesting(struct iommu_domain *domain);
int iommu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirks);
-void iommu_set_dma_strict(bool val);
-bool iommu_get_dma_strict(struct iommu_domain *domain);
+void iommu_set_dma_strict(void);
extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
unsigned long iova, int flags);
@@ -497,29 +523,80 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain,
iommu_iotlb_gather_init(iotlb_gather);
}
+/**
+ * iommu_iotlb_gather_is_disjoint - Checks whether a new range is disjoint
+ *
+ * @gather: TLB gather data
+ * @iova: start of page to invalidate
+ * @size: size of page to invalidate
+ *
+ * Helper for IOMMU drivers to check whether a new range and the gathered range
+ * are disjoint. For many IOMMUs, flushing the IOMMU in this case is better
+ * than merging the two, which might lead to unnecessary invalidations.
+ */
+static inline
+bool iommu_iotlb_gather_is_disjoint(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t size)
+{
+ unsigned long start = iova, end = start + size - 1;
+
+ return gather->end != 0 &&
+ (end + 1 < gather->start || start > gather->end + 1);
+}
+
+
+/**
+ * iommu_iotlb_gather_add_range - Gather for address-based TLB invalidation
+ * @gather: TLB gather data
+ * @iova: start of page to invalidate
+ * @size: size of page to invalidate
+ *
+ * Helper for IOMMU drivers to build arbitrarily-sized invalidation commands
+ * where only the address range matters, and simply minimising intermediate
+ * syncs is preferred.
+ */
+static inline void iommu_iotlb_gather_add_range(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t size)
+{
+ unsigned long end = iova + size - 1;
+
+ if (gather->start > iova)
+ gather->start = iova;
+ if (gather->end < end)
+ gather->end = end;
+}
+
+/**
+ * iommu_iotlb_gather_add_page - Gather for page-based TLB invalidation
+ * @domain: IOMMU domain to be invalidated
+ * @gather: TLB gather data
+ * @iova: start of page to invalidate
+ * @size: size of page to invalidate
+ *
+ * Helper for IOMMU drivers to build invalidation commands based on individual
+ * pages, or with page size/table level hints which cannot be gathered if they
+ * differ.
+ */
static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather,
unsigned long iova, size_t size)
{
- unsigned long start = iova, end = start + size - 1;
-
/*
* If the new page is disjoint from the current range or is mapped at
* a different granularity, then sync the TLB so that the gather
* structure can be rewritten.
*/
- if (gather->pgsize != size ||
- end + 1 < gather->start || start > gather->end + 1) {
- if (gather->pgsize)
- iommu_iotlb_sync(domain, gather);
- gather->pgsize = size;
- }
+ if ((gather->pgsize && gather->pgsize != size) ||
+ iommu_iotlb_gather_is_disjoint(gather, iova, size))
+ iommu_iotlb_sync(domain, gather);
- if (gather->end < end)
- gather->end = end;
+ gather->pgsize = size;
+ iommu_iotlb_gather_add_range(gather, iova, size);
+}
- if (gather->start > start)
- gather->start = start;
+static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
+{
+ return gather && gather->queued;
}
/* PCI device grouping function */
@@ -870,6 +947,11 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
{
}
+static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
+{
+ return false;
+}
+
static inline void iommu_device_unregister(struct iommu_device *iommu)
{
}
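For IOMMU drivers, the new helpers above factor the old add_page logic into reusable pieces, and gather->queued now conveys what the removed IO_PGTABLE_QUIRK_NON_STRICT quirk used to: with a flush queue, per-page invalidations can be skipped because ->flush_iotlb_all() runs later. A minimal sketch of a driver unmap/sync pair under that assumption; my_hw_unmap() and my_hw_inv_range() are hypothetical hardware hooks, not kernel APIs:

static size_t my_unmap(struct iommu_domain *domain, unsigned long iova,
		       size_t size, struct iommu_iotlb_gather *gather)
{
	size_t unmapped = my_hw_unmap(domain, iova, size);

	/*
	 * With a flush queue (gather->queued), invalidation happens later
	 * via ->flush_iotlb_all(), so skip gathering per-page syncs here.
	 */
	if (!iommu_iotlb_gather_queued(gather))
		iommu_iotlb_gather_add_page(domain, gather, iova, size);

	return unmapped;
}

static void my_iotlb_sync(struct iommu_domain *domain,
			  struct iommu_iotlb_gather *gather)
{
	if (!gather->pgsize)	/* nothing was gathered */
		return;

	/* Flush the accumulated [start, end] range in one command. */
	my_hw_inv_range(domain, gather->start,
			gather->end - gather->start + 1);
}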
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 4a53c3ca86bd..b066024c62e3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -99,8 +99,6 @@ void memblock_discard(void);
static inline void memblock_discard(void) {}
#endif
-phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align);
void memblock_allow_resize(void);
int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
int memblock_add(phys_addr_t base, phys_addr_t size);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 20151c4f1e0e..3096c9a0ee01 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter {
unsigned int generation;
};
-struct lruvec_stat {
- long count[NR_VM_NODE_STAT_ITEMS];
-};
-
-struct batched_lruvec_stat {
- s32 count[NR_VM_NODE_STAT_ITEMS];
-};
-
/*
* Bitmap and deferred work of shrinker::id corresponding to memcg-aware
* shrinkers, which have elements charged to this memcg.
@@ -123,24 +115,30 @@ struct shrinker_info {
unsigned long *map;
};
+struct lruvec_stats_percpu {
+ /* Local (CPU and cgroup) state */
+ long state[NR_VM_NODE_STAT_ITEMS];
+
+ /* Delta calculation for lockless upward propagation */
+ long state_prev[NR_VM_NODE_STAT_ITEMS];
+};
+
+struct lruvec_stats {
+ /* Aggregated (CPU and subtree) state */
+ long state[NR_VM_NODE_STAT_ITEMS];
+
+ /* Pending child counts during tree propagation */
+ long state_pending[NR_VM_NODE_STAT_ITEMS];
+};
+
/*
* per-node information in memory controller.
*/
struct mem_cgroup_per_node {
struct lruvec lruvec;
- /*
- * Legacy local VM stats. This should be struct lruvec_stat and
- * cannot be optimized to struct batched_lruvec_stat. Because
- * the threshold of the lruvec_stat_cpu can be as big as
- * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this
- * filed has no upper limit.
- */
- struct lruvec_stat __percpu *lruvec_stat_local;
-
- /* Subtree VM stats (batched updates) */
- struct batched_lruvec_stat __percpu *lruvec_stat_cpu;
- atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu;
+ struct lruvec_stats lruvec_stats;
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
@@ -595,13 +593,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
}
#endif
-static __always_inline bool memcg_stat_item_in_bytes(int idx)
-{
- if (idx == MEMCG_PERCPU_B)
- return true;
- return vmstat_item_in_bytes(idx);
-}
-
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
@@ -693,13 +684,35 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
page_counter_read(&memcg->memory);
}
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask);
+static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+ return __mem_cgroup_charge(page, mm, gfp_mask);
+}
+
int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
-void mem_cgroup_uncharge(struct page *page);
-void mem_cgroup_uncharge_list(struct list_head *page_list);
+void __mem_cgroup_uncharge(struct page *page);
+static inline void mem_cgroup_uncharge(struct page *page)
+{
+ if (mem_cgroup_disabled())
+ return;
+ __mem_cgroup_uncharge(page);
+}
+
+void __mem_cgroup_uncharge_list(struct list_head *page_list);
+static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
+{
+ if (mem_cgroup_disabled())
+ return;
+ __mem_cgroup_uncharge_list(page_list);
+}
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
@@ -884,11 +897,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
return !!(memcg->css.flags & CSS_ONLINE);
}
-/*
- * For memory reclaim.
- */
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zid, int nr_pages);
@@ -955,22 +963,21 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
local_irq_restore(flags);
}
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ return READ_ONCE(memcg->vmstats.state[idx]);
+}
+
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
struct mem_cgroup_per_node *pn;
- long x;
if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = atomic_long_read(&pn->lruvec_stat[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
+ return READ_ONCE(pn->lruvec_stats.state[idx]);
}
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
@@ -985,7 +992,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
for_each_possible_cpu(cpu)
- x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
+ x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -993,6 +1000,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
+void mem_cgroup_flush_stats(void);
+
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val);
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
@@ -1391,6 +1400,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
{
}
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ return 0;
+}
+
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
@@ -1403,6 +1417,10 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
}
+static inline void mem_cgroup_flush_stats(void)
+{
+}
+
static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx, int val)
{
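The charge/uncharge split above (mirrored later in this section for the swap and swaprate helpers) follows a single pattern: a static inline wrapper performs the cheap mem_cgroup_disabled() check at every call site, so the out-of-line __-prefixed function is only entered when the memory controller is actually in use. Schematically, with my_charge() standing in for the real names:

int __my_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);

static inline int my_charge(struct page *page, struct mm_struct *mm,
			    gfp_t gfp_mask)
{
	/* Early out when the memory controller is not in use. */
	if (mem_cgroup_disabled())
		return 0;
	return __my_charge(page, mm, gfp_mask);
}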
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 97e92e8b556a..d9a0b61cd432 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -90,7 +90,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
-extern struct memory_block *find_memory_block(struct mem_section *);
+extern struct memory_block *find_memory_block(unsigned long section_nr);
typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
extern int walk_memory_blocks(unsigned long start, unsigned long size,
void *arg, walk_memory_blocks_func_t func);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 0aaf91b496e2..4091692bed8c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -184,6 +184,14 @@ extern bool vma_migratable(struct vm_area_struct *vma);
extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
extern void mpol_put_task_policy(struct task_struct *);
+extern bool numa_demotion_enabled;
+
+static inline bool mpol_is_preferred_many(struct mempolicy *pol)
+{
+ return (pol->mode == MPOL_PREFERRED_MANY);
+}
+
+
#else
struct mempolicy {};
@@ -292,5 +300,13 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
{
return NULL;
}
+
+#define numa_demotion_enabled false
+
+static inline bool mpol_is_preferred_many(struct mempolicy *pol)
+{
+ return false;
+}
+
#endif /* CONFIG_NUMA */
#endif
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 23dadf7aeba8..326250996b4e 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -28,6 +28,7 @@ enum migrate_reason {
MR_NUMA_MISPLACED,
MR_CONTIG_RANGE,
MR_LONGTERM_PIN,
+ MR_DEMOTION,
MR_TYPES
};
@@ -41,7 +42,8 @@ extern int migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode);
extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
- unsigned long private, enum migrate_mode mode, int reason);
+ unsigned long private, enum migrate_mode mode, int reason,
+ unsigned int *ret_succeeded);
extern struct page *alloc_migration_target(struct page *page, unsigned long private);
extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
@@ -56,7 +58,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
static inline void putback_movable_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t new,
free_page_t free, unsigned long private, enum migrate_mode mode,
- int reason)
+ int reason, unsigned int *ret_succeeded)
{ return -ENOSYS; }
static inline struct page *alloc_migration_target(struct page *page,
unsigned long private)
@@ -166,6 +168,14 @@ struct migrate_vma {
int migrate_vma_setup(struct migrate_vma *args);
void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);
+int next_demotion_node(int node);
+
+#else /* CONFIG_MIGRATION disabled: */
+
+static inline int next_demotion_node(int node)
+{
+ return NUMA_NO_NODE;
+}
#endif /* CONFIG_MIGRATION */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e59646a5d44d..ed2552c9aeca 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1216,18 +1216,10 @@ static inline void get_page(struct page *page)
}
bool __must_check try_grab_page(struct page *page, unsigned int flags);
-__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs,
- unsigned int flags);
+struct page *try_grab_compound_head(struct page *page, int refs,
+ unsigned int flags);
-
-static inline __must_check bool try_get_page(struct page *page)
-{
- page = compound_head(page);
- if (WARN_ON_ONCE(page_ref_count(page) <= 0))
- return false;
- page_ref_inc(page);
- return true;
-}
+struct page *try_get_compound_head(struct page *page, int refs);
static inline void put_page(struct page *page)
{
@@ -1849,7 +1841,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
struct kvec;
int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
struct page **pages);
-int get_kernel_page(unsigned long start, int write, struct page **pages);
struct page *get_dump_page(unsigned long addr);
extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
@@ -3121,7 +3112,7 @@ extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
-extern void shake_page(struct page *p, int access);
+extern void shake_page(struct page *p);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fcb535560028..1bd5f5955f9a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -846,6 +846,7 @@ typedef struct pglist_data {
enum zone_type kcompactd_highest_zoneidx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
+ bool proactive_compact_trigger;
#endif
/*
* This is a per-node reserve of pages that are not available
@@ -1342,7 +1343,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
return NULL;
return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}
-extern unsigned long __section_nr(struct mem_section *ms);
extern size_t mem_section_usage_size(void);
/*
@@ -1365,7 +1365,7 @@ extern size_t mem_section_usage_size(void);
#define SECTION_TAINT_ZONE_DEVICE (1UL<<4)
#define SECTION_MAP_LAST_BIT (1UL<<5)
#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT 3
+#define SECTION_NID_SHIFT 6
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ed02aa522263..5dcf446f42e5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -736,7 +736,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
/*
* Fault everything in given userspace address range in.
*/
-static inline int fault_in_pages_writeable(char __user *uaddr, int size)
+static inline int fault_in_pages_writeable(char __user *uaddr, size_t size)
{
char __user *end = uaddr + size - 1;
@@ -763,7 +763,7 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size)
return 0;
}
-static inline int fault_in_pages_readable(const char __user *uaddr, int size)
+static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
{
volatile char c;
const char __user *end = uaddr + size - 1;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index e24b1fe348e3..5561486fddef 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -174,13 +174,13 @@ static inline gfp_t current_gfp_context(gfp_t flags)
}
#ifdef CONFIG_LOCKDEP
-extern void __fs_reclaim_acquire(void);
-extern void __fs_reclaim_release(void);
+extern void __fs_reclaim_acquire(unsigned long ip);
+extern void __fs_reclaim_release(unsigned long ip);
extern void fs_reclaim_acquire(gfp_t gfp_mask);
extern void fs_reclaim_release(gfp_t gfp_mask);
#else
-static inline void __fs_reclaim_acquire(void) { }
-static inline void __fs_reclaim_release(void) { }
+static inline void __fs_reclaim_acquire(unsigned long ip) { }
+static inline void __fs_reclaim_release(unsigned long ip) { }
static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif
@@ -306,7 +306,7 @@ set_active_memcg(struct mem_cgroup *memcg)
{
struct mem_cgroup *old;
- if (in_interrupt()) {
+ if (!in_task()) {
old = this_cpu_read(int_active_memcg);
this_cpu_write(int_active_memcg, memcg);
} else {
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 8e775ce517bb..166158b6e917 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -18,6 +18,7 @@ struct shmem_inode_info {
unsigned long flags;
unsigned long alloced; /* data pages alloced to file */
unsigned long swapped; /* subtotal assigned to swap */
+ pgoff_t fallocend; /* highest fallocate endindex */
struct list_head shrinklist; /* shrinkable hpage inodes */
struct list_head swaplist; /* chain of maybes on swap */
struct shared_policy policy; /* NUMA memory alloc policy */
@@ -31,7 +32,7 @@ struct shmem_sb_info {
struct percpu_counter used_blocks; /* How many are allocated */
unsigned long max_inodes; /* How many inodes are allowed */
unsigned long free_inodes; /* How many are left for allocation */
- spinlock_t stat_lock; /* Serialize shmem_sb_info changes */
+ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */
umode_t mode; /* Mount mode for root directory */
unsigned char huge; /* Whether to try for hugepages */
kuid_t uid; /* Mount uid for root directory */
@@ -85,7 +86,12 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
extern int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse);
-extern bool shmem_huge_enabled(struct vm_area_struct *vma);
+extern bool shmem_is_huge(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index);
+static inline bool shmem_huge_enabled(struct vm_area_struct *vma)
+{
+ return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff);
+}
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end);
@@ -93,9 +99,8 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
/* Flag allocation requirements to shmem_getpage */
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
+ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
- SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */
- SGP_HUGE, /* like SGP_CACHE, huge pages preferred */
SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
};
@@ -119,6 +124,18 @@ static inline bool shmem_file(struct file *file)
return shmem_mapping(file->f_mapping);
}
+/*
+ * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages
+ * beyond i_size's notion of EOF, which fallocate has committed to reserving:
+ * which split_huge_page() must therefore not delete. This use of a single
+ * "fallocend" per inode errs on the side of not deleting a reservation when
+ * in doubt: there are plenty of cases when it preserves unreserved pages.
+ */
+static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
+{
+ return max(eof, SHMEM_I(inode)->fallocend);
+}
+
extern bool shmem_charge(struct inode *inode, long pages);
extern void shmem_uncharge(struct inode *inode, long pages);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6f5a43251593..ba52f3a3478e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -408,7 +408,7 @@ static inline bool node_reclaim_enabled(void)
extern void check_move_unevictable_pages(struct pagevec *pvec);
-extern int kswapd_run(int nid);
+extern void kswapd_run(int nid);
extern void kswapd_stop(int nid);
#ifdef CONFIG_SWAP
@@ -721,7 +721,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
#endif
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
+extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
+static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+{
+ if (mem_cgroup_disabled())
+ return;
+ __cgroup_throttle_swaprate(page, gfp_mask);
+}
#else
static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
@@ -730,8 +736,22 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
#ifdef CONFIG_MEMCG_SWAP
extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
-extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
-extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
+extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
+static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+ return __mem_cgroup_try_charge_swap(page, entry);
+}
+
+extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
+static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+{
+ if (mem_cgroup_disabled())
+ return;
+ __mem_cgroup_uncharge_swap(entry, nr_pages);
+}
+
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct page *page);
#else
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 216854a5e513..b0cb2a9973f4 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -2,6 +2,7 @@
#ifndef __LINUX_SWIOTLB_H
#define __LINUX_SWIOTLB_H
+#include <linux/device.h>
#include <linux/dma-direction.h>
#include <linux/init.h>
#include <linux/types.h>
@@ -72,7 +73,8 @@ extern enum swiotlb_force swiotlb_force;
* range check to see if the memory was in fact allocated by this
* API.
* @nslabs: The number of IO TLB blocks (in groups of 64) between @start and
- * @end. This is command line adjustable via setup_io_tlb_npages.
+ * @end. For default swiotlb, this is command line adjustable via
+ * setup_io_tlb_npages.
* @used: The number of used IO TLB block.
* @list: The free list describing the number of free entries available
* from each index.
@@ -83,6 +85,8 @@ extern enum swiotlb_force swiotlb_force;
* unmap calls.
* @debugfs: The dentry to debugfs.
* @late_alloc: %true if allocated using the page allocator
+ * @force_bounce: %true if swiotlb bouncing is forced
+ * @for_alloc: %true if the pool is used for memory allocation
*/
struct io_tlb_mem {
phys_addr_t start;
@@ -93,29 +97,42 @@ struct io_tlb_mem {
spinlock_t lock;
struct dentry *debugfs;
bool late_alloc;
+ bool force_bounce;
+ bool for_alloc;
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
unsigned int list;
- } slots[];
+ } *slots;
};
-extern struct io_tlb_mem *io_tlb_default_mem;
+extern struct io_tlb_mem io_tlb_default_mem;
-static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
return mem && paddr >= mem->start && paddr < mem->end;
}
+static inline bool is_swiotlb_force_bounce(struct device *dev)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+
+ return mem && mem->force_bounce;
+}
+
void __init swiotlb_exit(void);
unsigned int swiotlb_max_segment(void);
size_t swiotlb_max_mapping_size(struct device *dev);
-bool is_swiotlb_active(void);
+bool is_swiotlb_active(struct device *dev);
void __init swiotlb_adjust_size(unsigned long size);
#else
#define swiotlb_force SWIOTLB_NO_FORCE
-static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
+{
+ return false;
+}
+static inline bool is_swiotlb_force_bounce(struct device *dev)
{
return false;
}
@@ -131,7 +148,7 @@ static inline size_t swiotlb_max_mapping_size(struct device *dev)
return SIZE_MAX;
}
-static inline bool is_swiotlb_active(void)
+static inline bool is_swiotlb_active(struct device *dev)
{
return false;
}
@@ -144,4 +161,28 @@ static inline void swiotlb_adjust_size(unsigned long size)
extern void swiotlb_print_info(void);
extern void swiotlb_set_max_segment(unsigned int);
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+struct page *swiotlb_alloc(struct device *dev, size_t size);
+bool swiotlb_free(struct device *dev, struct page *page, size_t size);
+
+static inline bool is_swiotlb_for_alloc(struct device *dev)
+{
+ return dev->dma_io_tlb_mem->for_alloc;
+}
+#else
+static inline struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+ return NULL;
+}
+static inline bool swiotlb_free(struct device *dev, struct page *page,
+ size_t size)
+{
+ return false;
+}
+static inline bool is_swiotlb_for_alloc(struct device *dev)
+{
+ return false;
+}
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
+
#endif /* __LINUX_SWIOTLB_H */
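With the swiotlb bookkeeping moved into a per-device io_tlb_mem, every query now takes the device, so restricted pools (see the DMA_RESTRICTED_POOL option at the end of this section) can coexist with the default pool. A sketch of how an unmap path consults it; my_unmap_page() is illustrative, while the swiotlb/dma-direct calls are assumed to keep their 5.15-era signatures:

#include <linux/dma-direct.h>
#include <linux/swiotlb.h>

static void my_unmap_page(struct device *dev, dma_addr_t dma_addr,
			  size_t size, enum dma_data_direction dir,
			  unsigned long attrs)
{
	phys_addr_t paddr = dma_to_phys(dev, dma_addr);

	/* dev->dma_io_tlb_mem decides whether this address was bounced. */
	if (is_swiotlb_buffer(dev, paddr))
		swiotlb_tbl_unmap_single(dev, paddr, size, dir, attrs);
}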
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2b47584eb843..60a3ab0ad2cc 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -915,6 +915,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len,
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
size_t vlen, int behavior, unsigned int flags);
+asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
unsigned long prot, unsigned long pgoff,
unsigned long flags);
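sys_process_mrelease() is declared above and wired up as syscall 448 in the asm-generic table further down in this section; it lets a userspace OOM killer reclaim a dying victim's memory without waiting for its exit path. A userspace sketch (syscall numbers taken from the asm-generic table, so they may differ per architecture; the target must already be exiting, e.g. after SIGKILL, or the call fails, and killing another task requires suitable permission):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open		434
#endif
#ifndef __NR_process_mrelease
#define __NR_process_mrelease	448	/* from the asm-generic table below */
#endif

int main(int argc, char **argv)
{
	pid_t pid;
	int pidfd;

	if (argc != 2)
		return 1;
	pid = atoi(argv[1]);

	pidfd = syscall(__NR_pidfd_open, pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* Kill the victim, then reap its address space immediately. */
	kill(pid, SIGKILL);
	if (syscall(__NR_process_mrelease, pidfd, 0))
		perror("process_mrelease");

	close(pidfd);
	return 0;
}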
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 331d2ccf0bcc..33cea484d1ad 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -60,16 +60,16 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- bool *mmap_changing, __u64 mode);
+ atomic_t *mmap_changing, __u64 mode);
extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long len,
- bool *mmap_changing);
+ atomic_t *mmap_changing);
extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long len, bool *mmap_changing);
+ unsigned long len, atomic_t *mmap_changing);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
- bool enable_wp, bool *mmap_changing);
+ bool enable_wp, atomic_t *mmap_changing);
/* mm helpers */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index ae0dd1948c2b..a185cc75ff52 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -33,6 +33,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGREUSE,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
+ PGDEMOTE_KSWAPD,
+ PGDEMOTE_DIRECT,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_DIRECT_THROTTLE,
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 6d28bc433c1c..6a2f51ebbfd3 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -37,7 +37,7 @@ extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
extern void vmpressure_init(struct vmpressure *vmpr);
extern void vmpressure_cleanup(struct vmpressure *vmpr);
extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
-extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
+extern struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr);
extern int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd,
const char *args);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 270677dc4f36..d1f65adf6a26 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -218,7 +218,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
void wbc_detach_inode(struct writeback_control *wbc);
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
size_t bytes);
-int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages,
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
enum wb_reason reason, struct wb_completion *done);
void cgroup_writeback_umount(void);
bool cleanup_offline_cgwb(struct bdi_writeback *wb);
@@ -374,7 +374,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
-void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
+void wb_update_bandwidth(struct bdi_writeback *wb);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
bool wb_over_bg_thresh(struct bdi_writeback *wb);
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 9fb2a3bbcdfb..779f3fad9ecd 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -21,7 +21,8 @@
EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \
EM( MR_NUMA_MISPLACED, "numa_misplaced") \
EM( MR_CONTIG_RANGE, "contig_range") \
- EMe(MR_LONGTERM_PIN, "longterm_pin")
+ EM( MR_LONGTERM_PIN, "longterm_pin") \
+ EMe(MR_DEMOTION, "demotion")
/*
* First define the enums in the above macros to be exported to userspace
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a9d6fcd95f42..14c8fe863c6d 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -877,9 +877,11 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
#define __NR_memfd_secret 447
__SYSCALL(__NR_memfd_secret, sys_memfd_secret)
#endif
+#define __NR_process_mrelease 448
+__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
#undef __NR_syscalls
-#define __NR_syscalls 448
+#define __NR_syscalls 449
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 19a00bc7fe86..046d0ccba4cd 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -22,6 +22,7 @@ enum {
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
+ MPOL_PREFERRED_MANY,
MPOL_MAX, /* always last member of enum */
};
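MPOL_PREFERRED_MANY extends MPOL_PREFERRED to a set of preferred nodes, with fallback to the rest of the system when they are full. A userspace sketch using libnuma's numaif.h wrapper; the literal 5 follows the enum order above, so prefer the uapi header's definition where available, and expect EINVAL on kernels without this change:

/* Build with something like: cc -o pref_many pref_many.c -lnuma */
#include <numaif.h>
#include <stdio.h>

#ifndef MPOL_PREFERRED_MANY
#define MPOL_PREFERRED_MANY 5	/* follows the enum order above */
#endif

int main(void)
{
	/* Prefer nodes 0 and 1; the kernel may still fall back elsewhere. */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	if (set_mempolicy(MPOL_PREFERRED_MANY, &nodemask,
			  8 * sizeof(nodemask)))
		perror("set_mempolicy(MPOL_PREFERRED_MANY)");
	return 0;
}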
diff --git a/ipc/msg.c b/ipc/msg.c
index 6810276d6bb9..a0d05775af2c 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -147,7 +147,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
key_t key = params->key;
int msgflg = params->flg;
- msq = kmalloc(sizeof(*msq), GFP_KERNEL);
+ msq = kmalloc(sizeof(*msq), GFP_KERNEL_ACCOUNT);
if (unlikely(!msq))
return -ENOMEM;
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 7bd0766ddc3b..ae83f0f2651b 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -42,7 +42,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
+ ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT);
if (ns == NULL)
goto fail_dec;
diff --git a/ipc/sem.c b/ipc/sem.c
index 971e75d28364..1a8b9f0ac047 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -514,7 +514,7 @@ static struct sem_array *sem_alloc(size_t nsems)
if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0]))
return NULL;
- sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL);
+ sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT);
if (unlikely(!sma))
return NULL;
@@ -1855,7 +1855,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp)
undo_list = current->sysvsem.undo_list;
if (!undo_list) {
- undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
+ undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT);
if (undo_list == NULL)
return -ENOMEM;
spin_lock_init(&undo_list->lock);
@@ -1941,7 +1941,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
/* step 2: allocate new undo structure */
new = kvzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!new) {
ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
return ERR_PTR(-ENOMEM);
@@ -2005,7 +2005,8 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops,
if (nsops > ns->sc_semopm)
return -E2BIG;
if (nsops > SEMOPM_FAST) {
- sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
+ sops = kvmalloc_array(nsops, sizeof(*sops),
+ GFP_KERNEL_ACCOUNT);
if (sops == NULL)
return -ENOMEM;
}
diff --git a/ipc/shm.c b/ipc/shm.c
index 748933e376ca..ab749be6d8b7 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -619,7 +619,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
ns->shm_tot + numpages > ns->shm_ctlall)
return -ENOSPC;
- shp = kmalloc(sizeof(*shp), GFP_KERNEL);
+ shp = kmalloc(sizeof(*shp), GFP_KERNEL_ACCOUNT);
if (unlikely(!shp))
return -ENOMEM;
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index f5e8828c109c..0d5c29879a50 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
struct cgroup_namespace *new_ns;
int ret;
- new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
ret = ns_alloc_inum(&new_ns->ns);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 725cfd51762b..1b02179758cb 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -80,6 +80,19 @@ config SWIOTLB
bool
select NEED_DMA_MAP_STATE
+config DMA_RESTRICTED_POOL
+ bool "DMA Restricted Pool"
+ depends on OF && OF_RESERVED_MEM && SWIOTLB
+ help
+ This enables support for restricted DMA pools which provide a level of
+ DMA memory protection on systems with limited hardware protection
+ capabilities, such as those lacking an IOMMU.
+
+ For more information see
+ <Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt>
+ and <kernel/dma/swiotlb.c>.
+ If unsure, say "n".
+
#
# Should be selected if we can mmap non-coherent mappings to userspace.
# The only thing that is really required is a way to set an uncached bit
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8dca4f97d12d..4c6c5e0635e3 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,6 +75,15 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
+static void __dma_direct_free_pages(struct device *dev, struct page *page,
+ size_t size)
+{
+ if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
+ swiotlb_free(dev, page, size))
+ return;
+ dma_free_contiguous(dev, page, size);
+}
+
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp)
{
@@ -86,6 +95,16 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_limit);
+ if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
+ is_swiotlb_for_alloc(dev)) {
+ page = swiotlb_alloc(dev, size);
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ __dma_direct_free_pages(dev, page, size);
+ return NULL;
+ }
+ return page;
+ }
+
page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
@@ -142,7 +161,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
gfp |= __GFP_NOWARN;
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev)) {
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
@@ -157,7 +176,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
!IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
- !dev_is_dma_coherent(dev))
+ !dev_is_dma_coherent(dev) &&
+ !is_swiotlb_for_alloc(dev))
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
@@ -167,11 +187,16 @@ void *dma_direct_alloc(struct device *dev, size_t size,
/*
* Remapping or decrypting memory may block. If either is required and
* we can't block, allocate the memory from the atomic pools.
+ * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must
+	 * set up another device coherent pool via the shared-dma-pool binding
+	 * and use dma_alloc_from_dev_coherent instead.
*/
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
!gfpflags_allow_blocking(gfp) &&
(force_dma_unencrypted(dev) ||
- (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev))))
+ (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !dev_is_dma_coherent(dev))) &&
+ !is_swiotlb_for_alloc(dev))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
@@ -242,7 +267,7 @@ out_encrypt_pages:
return NULL;
}
out_free_pages:
- dma_free_contiguous(dev, page, size);
+ __dma_direct_free_pages(dev, page, size);
return NULL;
}
@@ -252,7 +277,7 @@ void dma_direct_free(struct device *dev, size_t size,
unsigned int page_order = get_order(size);
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev)) {
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
dma_free_contiguous(dev, cpu_addr, size);
return;
@@ -261,7 +286,8 @@ void dma_direct_free(struct device *dev, size_t size,
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
!IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
- !dev_is_dma_coherent(dev)) {
+ !dev_is_dma_coherent(dev) &&
+ !is_swiotlb_for_alloc(dev)) {
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
return;
}
@@ -286,7 +312,7 @@ void dma_direct_free(struct device *dev, size_t size,
else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
- dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
+ __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
}
struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -296,7 +322,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
void *ret;
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp))
+ force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
+ !is_swiotlb_for_alloc(dev))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
page = __dma_direct_alloc_pages(dev, size, gfp);
@@ -323,7 +350,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
out_free_pages:
- dma_free_contiguous(dev, page, size);
+ __dma_direct_free_pages(dev, page, size);
return NULL;
}
@@ -342,7 +369,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
- dma_free_contiguous(dev, page, size);
+ __dma_direct_free_pages(dev, page, size);
}
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
@@ -356,7 +383,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_device(dev, paddr, sg->length,
dir);
@@ -382,7 +409,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(paddr, sg->length, dir);
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
dir);
@@ -510,8 +537,8 @@ int dma_direct_supported(struct device *dev, u64 mask)
size_t dma_direct_max_mapping_size(struct device *dev)
{
/* If SWIOTLB is active, use its maximum mapping size */
- if (is_swiotlb_active() &&
- (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
+ if (is_swiotlb_active(dev) &&
+ (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev)))
return swiotlb_max_mapping_size(dev);
return SIZE_MAX;
}
@@ -519,7 +546,7 @@ size_t dma_direct_max_mapping_size(struct device *dev)
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
{
return !dev_is_dma_coherent(dev) ||
- is_swiotlb_buffer(dma_to_phys(dev, dma_addr));
+ is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr));
}
/**
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 50afc05b6f1d..4632b0f4f72e 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -56,7 +56,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
{
phys_addr_t paddr = dma_to_phys(dev, addr);
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev))
@@ -73,7 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
if (dir == DMA_FROM_DEVICE)
@@ -87,7 +87,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
- if (unlikely(swiotlb_force == SWIOTLB_FORCE))
+ if (is_swiotlb_force_bounce(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
@@ -113,7 +113,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
- if (unlikely(is_swiotlb_buffer(phys)))
+ if (unlikely(is_swiotlb_buffer(dev, phys)))
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e50df8d8f87e..87c40517e822 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -39,6 +39,13 @@
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#endif
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+#include <linux/slab.h>
+#endif
#include <asm/io.h>
#include <asm/dma.h>
@@ -63,7 +70,7 @@
enum swiotlb_force swiotlb_force;
-struct io_tlb_mem *io_tlb_default_mem;
+struct io_tlb_mem io_tlb_default_mem;
/*
* Max segment that we can provide which (if pages are contiguous) will
@@ -94,7 +101,7 @@ early_param("swiotlb", setup_io_tlb_npages);
unsigned int swiotlb_max_segment(void)
{
- return io_tlb_default_mem ? max_segment : 0;
+ return io_tlb_default_mem.nslabs ? max_segment : 0;
}
EXPORT_SYMBOL_GPL(swiotlb_max_segment);
@@ -127,9 +134,9 @@ void __init swiotlb_adjust_size(unsigned long size)
void swiotlb_print_info(void)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
- if (!mem) {
+ if (!mem->nslabs) {
pr_warn("No low mem\n");
return;
}
@@ -156,11 +163,11 @@ static inline unsigned long nr_slots(u64 val)
*/
void __init swiotlb_update_mem_attributes(void)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
void *vaddr;
unsigned long bytes;
- if (!mem || mem->late_alloc)
+ if (!mem->nslabs || mem->late_alloc)
return;
vaddr = phys_to_virt(mem->start);
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
@@ -168,36 +175,50 @@ void __init swiotlb_update_mem_attributes(void)
memset(vaddr, 0, bytes);
}
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
+ unsigned long nslabs, bool late_alloc)
{
+ void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
- struct io_tlb_mem *mem;
- size_t alloc_size;
-
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return 0;
-
- /* protect against double initialization */
- if (WARN_ON_ONCE(io_tlb_default_mem))
- return -ENOMEM;
- alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs));
- mem = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!mem)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
mem->nslabs = nslabs;
- mem->start = __pa(tlb);
+ mem->start = start;
mem->end = mem->start + bytes;
mem->index = 0;
+ mem->late_alloc = late_alloc;
+
+ if (swiotlb_force == SWIOTLB_FORCE)
+ mem->force_bounce = true;
+
spin_lock_init(&mem->lock);
for (i = 0; i < mem->nslabs; i++) {
mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
}
+ memset(vaddr, 0, bytes);
+}
+
+int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+{
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ size_t alloc_size;
+
+ if (swiotlb_force == SWIOTLB_NO_FORCE)
+ return 0;
+
+ /* protect against double initialization */
+ if (WARN_ON_ONCE(mem->nslabs))
+ return -ENOMEM;
+
+ alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
+ mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!mem->slots)
+ panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
+ swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
- io_tlb_default_mem = mem;
if (verbose)
swiotlb_print_info();
swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
@@ -282,37 +303,24 @@ swiotlb_late_init_with_default_size(size_t default_size)
int
swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
{
- unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
- struct io_tlb_mem *mem;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ unsigned long bytes = nslabs << IO_TLB_SHIFT;
if (swiotlb_force == SWIOTLB_NO_FORCE)
return 0;
/* protect against double initialization */
- if (WARN_ON_ONCE(io_tlb_default_mem))
+ if (WARN_ON_ONCE(mem->nslabs))
return -ENOMEM;
- mem = (void *)__get_free_pages(GFP_KERNEL,
- get_order(struct_size(mem, slots, nslabs)));
- if (!mem)
+ mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(array_size(sizeof(*mem->slots), nslabs)));
+ if (!mem->slots)
return -ENOMEM;
- mem->nslabs = nslabs;
- mem->start = virt_to_phys(tlb);
- mem->end = mem->start + bytes;
- mem->index = 0;
- mem->late_alloc = 1;
- spin_lock_init(&mem->lock);
- for (i = 0; i < mem->nslabs; i++) {
- mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
- mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
- mem->slots[i].alloc_size = 0;
- }
-
set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
- memset(tlb, 0, bytes);
+ swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
- io_tlb_default_mem = mem;
swiotlb_print_info();
swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
@@ -320,18 +328,28 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
void __init swiotlb_exit(void)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
- size_t size;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ unsigned long tbl_vaddr;
+ size_t tbl_size, slots_size;
- if (!mem)
+ if (!mem->nslabs)
return;
- size = struct_size(mem, slots, mem->nslabs);
- if (mem->late_alloc)
- free_pages((unsigned long)mem, get_order(size));
- else
- memblock_free_late(__pa(mem), PAGE_ALIGN(size));
- io_tlb_default_mem = NULL;
+ pr_info("tearing down default memory pool\n");
+ tbl_vaddr = (unsigned long)phys_to_virt(mem->start);
+ tbl_size = PAGE_ALIGN(mem->end - mem->start);
+ slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
+
+ set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
+ if (mem->late_alloc) {
+ free_pages(tbl_vaddr, get_order(tbl_size));
+ free_pages((unsigned long)mem->slots, get_order(slots_size));
+ } else {
+ memblock_free_late(mem->start, tbl_size);
+ memblock_free_late(__pa(mem->slots), slots_size);
+ }
+
+ memset(mem, 0, sizeof(*mem));
}
/*
@@ -348,19 +366,33 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
enum dma_data_direction dir)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
unsigned char *vaddr = phys_to_virt(tlb_addr);
- unsigned int tlb_offset;
+ unsigned int tlb_offset, orig_addr_offset;
if (orig_addr == INVALID_PHYS_ADDR)
return;
- tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) -
- swiotlb_align_offset(dev, orig_addr);
+ tlb_offset = tlb_addr & (IO_TLB_SIZE - 1);
+ orig_addr_offset = swiotlb_align_offset(dev, orig_addr);
+ if (tlb_offset < orig_addr_offset) {
+ dev_WARN_ONCE(dev, 1,
+ "Access before mapping start detected. orig offset %u, requested offset %u.\n",
+ orig_addr_offset, tlb_offset);
+ return;
+ }
+
+ tlb_offset -= orig_addr_offset;
+ if (tlb_offset > alloc_size) {
+ dev_WARN_ONCE(dev, 1,
+ "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n",
+ alloc_size, size, tlb_offset);
+ return;
+ }
orig_addr += tlb_offset;
alloc_size -= tlb_offset;
@@ -426,10 +458,10 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
* Find a suitable number of IO TLB entries size that will fit this request and
* allocate a buffer from that IO TLB pool.
*/
-static int find_slots(struct device *dev, phys_addr_t orig_addr,
- size_t alloc_size)
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
@@ -438,6 +470,7 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
unsigned int nslots = nr_slots(alloc_size), stride;
unsigned int index, wrap, count = 0, i;
+ unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned long flags;
BUG_ON(!nslots);
@@ -457,8 +490,9 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
do {
- if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
- (orig_addr & iotlb_align_mask)) {
+ if (orig_addr &&
+ (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask)) {
index = wrap_index(mem, index + 1);
continue;
}
@@ -482,8 +516,11 @@ not_found:
return -1;
found:
- for (i = index; i < index + nslots; i++)
+ for (i = index; i < index + nslots; i++) {
mem->slots[i].list = 0;
+ mem->slots[i].alloc_size =
+ alloc_size - (offset + ((i - index) << IO_TLB_SHIFT));
+ }
for (i = index - 1;
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
mem->slots[i].list; i--)
@@ -506,7 +543,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
enum dma_data_direction dir, unsigned long attrs)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned int i;
int index;
@@ -524,7 +561,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}
- index = find_slots(dev, orig_addr, alloc_size + offset);
+ index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
@@ -538,11 +575,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
- for (i = 0; i < nr_slots(alloc_size + offset); i++) {
+ for (i = 0; i < nr_slots(alloc_size + offset); i++)
mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
- mem->slots[index + i].alloc_size =
- alloc_size - (i << IO_TLB_SHIFT);
- }
tlb_addr = slot_addr(mem->start, index) + offset;
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
@@ -550,28 +584,16 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return tlb_addr;
}
-/*
- * tlb_addr is the physical address of the bounce buffer to unmap.
- */
-void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t mapping_size, enum dma_data_direction dir,
- unsigned long attrs)
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long flags;
- unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
+ unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
int nslots = nr_slots(mem->slots[index].alloc_size + offset);
int count, i;
/*
- * First, sync the memory before unmapping the entry
- */
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
-
- /*
* Return the buffer to the free list by setting the corresponding
* entries to indicate the number of contiguous entries available.
* While returning the entries to the free list, we merge the entries
@@ -605,6 +627,23 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
spin_unlock_irqrestore(&mem->lock, flags);
}
+/*
+ * tlb_addr is the physical address of the bounce buffer to unmap.
+ */
+void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ /*
+ * First, sync the memory before unmapping the entry
+ */
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+
+ swiotlb_release_slots(dev, tlb_addr);
+}
+
void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
{
@@ -662,26 +701,155 @@ size_t swiotlb_max_mapping_size(struct device *dev)
return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
}
-bool is_swiotlb_active(void)
+bool is_swiotlb_active(struct device *dev)
{
- return io_tlb_default_mem != NULL;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+
+ return mem && mem->nslabs;
}
EXPORT_SYMBOL_GPL(is_swiotlb_active);
#ifdef CONFIG_DEBUG_FS
+static struct dentry *debugfs_dir;
-static int __init swiotlb_create_debugfs(void)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
-
- if (!mem)
- return 0;
- mem->debugfs = debugfs_create_dir("swiotlb", NULL);
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
+}
+
+static int __init swiotlb_create_default_debugfs(void)
+{
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+
+ debugfs_dir = debugfs_create_dir("swiotlb", NULL);
+ if (mem->nslabs) {
+ mem->debugfs = debugfs_dir;
+ swiotlb_create_debugfs_files(mem);
+ }
return 0;
}
-late_initcall(swiotlb_create_debugfs);
+late_initcall(swiotlb_create_default_debugfs);
+
+#endif
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+
+#ifdef CONFIG_DEBUG_FS
+static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
+{
+ struct io_tlb_mem *mem = rmem->priv;
+
+ mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir);
+ swiotlb_create_debugfs_files(mem);
+}
+#else
+static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
+{
+}
#endif
+
+struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ phys_addr_t tlb_addr;
+ int index;
+
+ if (!mem)
+ return NULL;
+
+ index = swiotlb_find_slots(dev, 0, size);
+ if (index == -1)
+ return NULL;
+
+ tlb_addr = slot_addr(mem->start, index);
+
+ return pfn_to_page(PFN_DOWN(tlb_addr));
+}
+
+bool swiotlb_free(struct device *dev, struct page *page, size_t size)
+{
+ phys_addr_t tlb_addr = page_to_phys(page);
+
+ if (!is_swiotlb_buffer(dev, tlb_addr))
+ return false;
+
+ swiotlb_release_slots(dev, tlb_addr);
+
+ return true;
+}
+
+static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ struct io_tlb_mem *mem = rmem->priv;
+ unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+
+ /*
+	 * Since multiple devices can share the same pool, the private data
+	 * (an io_tlb_mem struct) is initialized by the first device attached
+ * to it.
+ */
+ if (!mem) {
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs),
+ GFP_KERNEL);
+ if (!mem->slots) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
+ rmem->size >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
+ mem->force_bounce = true;
+ mem->for_alloc = true;
+
+ rmem->priv = mem;
+
+ rmem_swiotlb_debugfs_init(rmem);
+ }
+
+ dev->dma_io_tlb_mem = mem;
+
+ return 0;
+}
+
+static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+}
+
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+ .device_init = rmem_swiotlb_device_init,
+ .device_release = rmem_swiotlb_device_release,
+};
+
+static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+{
+ unsigned long node = rmem->fdt_node;
+
+ if (of_get_flat_dt_prop(node, "reusable", NULL) ||
+ of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+ of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
+ of_get_flat_dt_prop(node, "no-map", NULL))
+ return -EINVAL;
+
+ if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+ pr_err("Restricted DMA pool must be accessible within the linear mapping.");
+ return -EINVAL;
+ }
+
+ rmem->ops = &rmem_swiotlb_ops;
+ pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+ return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup);
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
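Aside (not part of the patch): a consumer of such a pool is expected to reference a reserved-memory node with compatible = "restricted-dma-pool" through its memory-region property (see the binding document named in the Kconfig help above). A hedged driver-side sketch, where foo_probe() and the explicit of_reserved_mem_device_init() call are illustrative assumptions rather than the only way to bind:

/* Hypothetical driver sketch: bind to the restricted pool, then do an
 * ordinary coherent allocation that is served by swiotlb_alloc(). */
#include <linux/dma-mapping.h>
#include <linux/of_reserved_mem.h>
#include <linux/platform_device.h>
#include <linux/sizes.h>

static int foo_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	dma_addr_t dma;
	void *buf;
	int ret;

	/* Invokes rmem_swiotlb_device_init(), which sets dev->dma_io_tlb_mem. */
	ret = of_reserved_mem_device_init(dev);
	if (ret && ret != -ENODEV)
		return ret;

	/* With force_bounce/for_alloc set on the pool, this stays inside it. */
	buf = dma_alloc_coherent(dev, SZ_4K, &dma, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	dma_free_coherent(dev, SZ_4K, buf, dma);
	return 0;
}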
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 62be16135e7c..19e83e9b723c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -491,6 +491,7 @@ struct irq_domain *irq_get_default_host(void)
{
return irq_default_domain;
}
+EXPORT_SYMBOL_GPL(irq_get_default_host);
static bool irq_domain_is_nomap(struct irq_domain *domain)
{
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 3a4beb9395c4..291b857a6e20 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -411,7 +411,7 @@ void klp_try_complete_transition(void)
/*
* Ditto for the idle "swapper" tasks.
*/
- get_online_cpus();
+ cpus_read_lock();
for_each_possible_cpu(cpu) {
task = idle_task(cpu);
if (cpu_online(cpu)) {
@@ -423,7 +423,7 @@ void klp_try_complete_transition(void)
task->patch_state = klp_target_state;
}
}
- put_online_cpus();
+ cpus_read_unlock();
if (!complete) {
if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT))
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index abc01fcad8c7..eec72ca962e2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -568,6 +568,6 @@ out:
int __init nsproxy_cache_init(void)
{
- nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
+ nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
return 0;
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ca43239a255a..a46a3723bc66 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
mutex_lock(&pid_caches_mutex);
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
- *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
+ *pkc = kmem_cache_create(name, len, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0);
mutex_unlock(&pid_caches_mutex);
/* current can fail, but someone else can succeed. */
return READ_ONCE(*pkc);
@@ -449,7 +450,7 @@ const struct proc_ns_operations pidns_for_children_operations = {
static __init int pid_namespaces_init(void)
{
- pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_paths(kern_path, pid_ns_ctl_table);
diff --git a/kernel/signal.c b/kernel/signal.c
index cf7e2505ae31..952741f6d0f9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -4726,7 +4726,7 @@ void __init signals_init(void)
{
siginfo_buildtime_checks();
- sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
+ sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT);
}
#ifdef CONFIG_KGDB_KDB
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index cb6f98f5c97a..64578adfe115 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -289,6 +289,7 @@ COND_SYSCALL(munlockall);
COND_SYSCALL(mincore);
COND_SYSCALL(madvise);
COND_SYSCALL(process_madvise);
+COND_SYSCALL(process_mrelease);
COND_SYSCALL(remap_file_pages);
COND_SYSCALL(mbind);
COND_SYSCALL_COMPAT(mbind);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 25e49b4d8049..083be6af29d7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2912,7 +2912,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_compaction_proactiveness,
.maxlen = sizeof(sysctl_compaction_proactiveness),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = compaction_proactiveness_sysctl_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 12eab0d2ae28..aec832801c26 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+ ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
refcount_set(&ns->ns.count, 1);
- ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 3913222e7bcf..1cd10b102c51 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
static __init int init_posix_timers(void)
{
posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof (struct k_itimer), 0, SLAB_PANIC,
- NULL);
+ sizeof(struct k_itimer), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
__initcall(init_posix_timers);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index ef82d401dde8..6b2e3ca7ee99 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1385,7 +1385,7 @@ const struct proc_ns_operations userns_operations = {
static __init int user_namespaces_init(void)
{
- user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+ user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
return 0;
}
subsys_initcall(user_namespaces_init);
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index f4b1ff78ab2d..abb3432ed744 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -918,9 +918,8 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
miter->__offset += miter->consumed;
miter->__remaining -= miter->consumed;
- if ((miter->__flags & SG_MITER_TO_SG) &&
- !PageSlab(miter->page))
- flush_kernel_dcache_page(miter->page);
+ if (miter->__flags & SG_MITER_TO_SG)
+ flush_dcache_page(miter->page);
if (miter->__flags & SG_MITER_ATOMIC) {
WARN_ON_ONCE(preemptible());
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 8be9d4b3b259..8835e0784578 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -120,12 +120,28 @@ static void kasan_test_exit(struct kunit *test)
static void kmalloc_oob_right(struct kunit *test)
{
char *ptr;
- size_t size = 123;
+ size_t size = 128 - KASAN_GRANULE_SIZE - 5;
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 'x');
+ /*
+ * An unaligned access past the requested kmalloc size.
+ * Only generic KASAN can precisely detect these.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x');
+
+ /*
+ * An aligned access into the first out-of-bounds granule that falls
+ * within the aligned kmalloc object.
+ */
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y');
+
+ /* Out-of-bounds access past the aligned kmalloc object. */
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] =
+ ptr[size + KASAN_GRANULE_SIZE + 5]);
+
kfree(ptr);
}
@@ -149,7 +165,7 @@ static void kmalloc_node_oob_right(struct kunit *test)
ptr = kmalloc_node(size, GFP_KERNEL, 0);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
kfree(ptr);
}
@@ -185,7 +201,7 @@ static void kmalloc_pagealloc_uaf(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
}
static void kmalloc_pagealloc_invalid_free(struct kunit *test)
@@ -219,7 +235,7 @@ static void pagealloc_oob_right(struct kunit *test)
ptr = page_address(pages);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
free_pages((unsigned long)ptr, order);
}
@@ -234,7 +250,7 @@ static void pagealloc_uaf(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
free_pages((unsigned long)ptr, order);
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
}
static void kmalloc_large_oob_right(struct kunit *test)
@@ -410,64 +426,70 @@ static void kmalloc_uaf_16(struct kunit *test)
kfree(ptr1);
}
+/*
+ * Note: in the memset tests below, the written range touches both valid and
+ * invalid memory. This makes sure that the instrumentation checks not only
+ * the starting address but the whole range.
+ */
+
static void kmalloc_oob_memset_2(struct kunit *test)
{
char *ptr;
- size_t size = 8;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 7 + OOB_TAG_OFF, 0, 2));
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2));
kfree(ptr);
}
static void kmalloc_oob_memset_4(struct kunit *test)
{
char *ptr;
- size_t size = 8;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 5 + OOB_TAG_OFF, 0, 4));
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4));
kfree(ptr);
}
-
static void kmalloc_oob_memset_8(struct kunit *test)
{
char *ptr;
- size_t size = 8;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 8));
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8));
kfree(ptr);
}
static void kmalloc_oob_memset_16(struct kunit *test)
{
char *ptr;
- size_t size = 16;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 16));
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16));
kfree(ptr);
}
static void kmalloc_oob_in_memset(struct kunit *test)
{
char *ptr;
- size_t size = 666;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + 5 + OOB_TAG_OFF));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memset(ptr, 0, size + KASAN_GRANULE_SIZE));
kfree(ptr);
}
@@ -477,11 +499,17 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
size_t size = 64;
volatile size_t invalid_size = -2;
+ /*
+ * Hardware tag-based mode doesn't check memmove for negative size.
+ * As a result, this test introduces a side-effect memory corruption,
+ * which can result in a crash.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
memset((char *)ptr, 0, 64);
-
KUNIT_EXPECT_KASAN_FAIL(test,
memmove((char *)ptr, (char *)ptr + 4, invalid_size));
kfree(ptr);
@@ -496,7 +524,7 @@ static void kmalloc_uaf(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, *(ptr + 8) = 'x');
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]);
}
static void kmalloc_uaf_memset(struct kunit *test)
@@ -504,6 +532,12 @@ static void kmalloc_uaf_memset(struct kunit *test)
char *ptr;
size_t size = 33;
+ /*
+ * Only generic KASAN uses quarantine, which is required to avoid a
+ * kernel memory corruption this test causes.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -535,7 +569,7 @@ again:
goto again;
}
- KUNIT_EXPECT_KASAN_FAIL(test, ptr1[40] = 'x');
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]);
KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2);
kfree(ptr2);
@@ -682,7 +716,7 @@ static void ksize_unpoisons_memory(struct kunit *test)
ptr[size] = 'x';
/* This one must. */
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[real_size] = 'y');
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size]);
kfree(ptr);
}
@@ -701,8 +735,8 @@ static void ksize_uaf(struct kunit *test)
kfree(ptr);
KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *(ptr + size));
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
}
static void kasan_stack_oob(struct kunit *test)
diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c
index f1017f345d6c..7ebf433edef3 100644
--- a/lib/test_kasan_module.c
+++ b/lib/test_kasan_module.c
@@ -15,13 +15,11 @@
#include "../mm/kasan/kasan.h"
-#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
-
static noinline void __init copy_user_test(void)
{
char *kmem;
char __user *usermem;
- size_t size = 10;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
int __maybe_unused unused;
kmem = kmalloc(size, GFP_KERNEL);
@@ -38,25 +36,25 @@ static noinline void __init copy_user_test(void)
}
pr_info("out-of-bounds in copy_from_user()\n");
- unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
+ unused = copy_from_user(kmem, usermem, size + 1);
pr_info("out-of-bounds in copy_to_user()\n");
- unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
+ unused = copy_to_user(usermem, kmem, size + 1);
pr_info("out-of-bounds in __copy_from_user()\n");
- unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
+ unused = __copy_from_user(kmem, usermem, size + 1);
pr_info("out-of-bounds in __copy_to_user()\n");
- unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
+ unused = __copy_to_user(usermem, kmem, size + 1);
pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
- unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF);
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
- unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF);
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
pr_info("out-of-bounds in strncpy_from_user()\n");
- unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
+ unused = strncpy_from_user(kmem, usermem, size + 1);
vm_munmap((unsigned long)usermem, PAGE_SIZE);
kfree(kmem);
@@ -73,7 +71,7 @@ static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp)
struct kasan_rcu_info, rcu);
kfree(fp);
- fp->i = 1;
+ ((volatile struct kasan_rcu_info *)fp)->i;
}
static noinline void __init kasan_rcu_uaf(void)
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 01e9543de566..e14993bc84d2 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -35,6 +35,9 @@ __param(int, test_repeat_count, 1,
__param(int, test_loop_count, 1000000,
"Set test loop counter");
+__param(int, nr_pages, 0,
+	"Set number of pages for fix_size_alloc_test (default: 1)");
+
__param(int, run_test_mask, INT_MAX,
"Set tests specified in the mask.\n\n"
"\t\tid: 1, name: fix_size_alloc_test\n"
@@ -262,7 +265,7 @@ static int fix_size_alloc_test(void)
int i;
for (i = 0; i < test_loop_count; i++) {
- ptr = vmalloc(3 * PAGE_SIZE);
+		ptr = vmalloc((nr_pages > 0 ? nr_pages : 1) * PAGE_SIZE);
if (!ptr)
return -1;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index cd06dca232c3..4a9d4e27d0d9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
spin_unlock_bh(&wb->work_lock);
}
+static void wb_update_bandwidth_workfn(struct work_struct *work)
+{
+ struct bdi_writeback *wb = container_of(to_delayed_work(work),
+ struct bdi_writeback, bw_dwork);
+
+ wb_update_bandwidth(wb);
+}
+
/*
* Initial write bandwidth: 100 MB/s
*/
@@ -293,6 +301,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->b_dirty_time);
spin_lock_init(&wb->list_lock);
+ atomic_set(&wb->writeback_inodes, 0);
wb->bw_time_stamp = jiffies;
wb->balanced_dirty_ratelimit = INIT_BW;
wb->dirty_ratelimit = INIT_BW;
@@ -302,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
spin_lock_init(&wb->work_lock);
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+ INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
@@ -350,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
+ flush_delayed_work(&wb->bw_dwork);
}
static void wb_exit(struct bdi_writeback *wb)
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index 5b152dba7344..f03f42f426f6 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -39,7 +39,7 @@ void put_page_bootmem(struct page *page)
}
#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void register_page_bootmem_info_section(unsigned long start_pfn)
+static void __init register_page_bootmem_info_section(unsigned long start_pfn)
{
unsigned long mapsize, section_nr, i;
struct mem_section *ms;
@@ -74,7 +74,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
-static void register_page_bootmem_info_section(unsigned long start_pfn)
+static void __init register_page_bootmem_info_section(unsigned long start_pfn)
{
unsigned long mapsize, section_nr, i;
struct mem_section *ms;
diff --git a/mm/compaction.c b/mm/compaction.c
index 621508e0ecd5..fa9b2b598eab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2398,7 +2398,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
- MR_COMPACTION);
+ MR_COMPACTION, NULL);
trace_mm_compaction_migratepages(cc->nr_migratepages, err,
&cc->migratepages);
@@ -2706,6 +2706,30 @@ static void compact_nodes(void)
*/
unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ int rc, nid;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write && sysctl_compaction_proactiveness) {
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (pgdat->proactive_compact_trigger)
+ continue;
+
+ pgdat->proactive_compact_trigger = true;
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+ }
+ }
+
+ return 0;
+}
+
/*
* This is the entry point for compacting all nodes via
* /proc/sys/vm/compact_memory
@@ -2750,7 +2774,8 @@ void compaction_unregister_node(struct node *node)
static inline bool kcompactd_work_requested(pg_data_t *pgdat)
{
- return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+ return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
+ pgdat->proactive_compact_trigger;
}
static bool kcompactd_node_suitable(pg_data_t *pgdat)
@@ -2885,7 +2910,8 @@ static int kcompactd(void *p)
{
pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
- unsigned int proactive_defer = 0;
+ long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
+ long timeout = default_timeout;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -2900,25 +2926,39 @@ static int kcompactd(void *p)
while (!kthread_should_stop()) {
unsigned long pflags;
+ /*
+ * Avoid the unnecessary wakeup for proactive compaction
+ * when it is disabled.
+ */
+ if (!sysctl_compaction_proactiveness)
+ timeout = MAX_SCHEDULE_TIMEOUT;
trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
- kcompactd_work_requested(pgdat),
- msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+ kcompactd_work_requested(pgdat), timeout) &&
+ !pgdat->proactive_compact_trigger) {
psi_memstall_enter(&pflags);
kcompactd_do_work(pgdat);
psi_memstall_leave(&pflags);
+ /*
+ * Reset the timeout value. The defer timeout from
+ * proactive compaction is lost here but that is fine
+			 * as, if the zone has changed substantially, carrying on
+			 * with the previous defer interval is not useful.
+ */
+ timeout = default_timeout;
continue;
}
- /* kcompactd wait timeout */
+ /*
+ * Start the proactive work with default timeout. Based
+		 * Start the proactive work with the default timeout. Based
+ */
+ timeout = default_timeout;
if (should_proactive_compact_node(pgdat)) {
unsigned int prev_score, score;
- if (proactive_defer) {
- proactive_defer--;
- continue;
- }
prev_score = fragmentation_score_node(pgdat);
proactive_compact_node(pgdat);
score = fragmentation_score_node(pgdat);
@@ -2926,9 +2966,12 @@ static int kcompactd(void *p)
* Defer proactive compaction if the fragmentation
* score did not go down i.e. no progress made.
*/
- proactive_defer = score < prev_score ?
- 0 : 1 << COMPACT_MAX_DEFER_SHIFT;
+ if (unlikely(score >= prev_score))
+ timeout =
+ default_timeout << COMPACT_MAX_DEFER_SHIFT;
}
+ if (unlikely(pgdat->proactive_compact_trigger))
+ pgdat->proactive_compact_trigger = false;
}
return 0;
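Aside (not part of the patch): with the handler above, raising the tunable now wakes kcompactd immediately instead of waiting for its next timeout. A hedged sketch of poking it from userspace, assuming the conventional /proc/sys/vm/compaction_proactiveness path for this vm_table entry:

/* Hypothetical example: set proactiveness to a non-zero value (e.g. "30"),
 * which now also wakes kcompactd on every online node. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int set_compaction_proactiveness(const char *val)
{
	int fd = open("/proc/sys/vm/compaction_proactiveness", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}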
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 1c922691aa61..1403639302e4 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -29,6 +29,8 @@
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
#include <linux/io.h>
+
+#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -58,10 +60,41 @@
#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
#define RANDOM_NZVALUE GENMASK(7, 0)
-static void __init pte_basic_tests(unsigned long pfn, int idx)
+struct pgtable_debug_args {
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ p4d_t *start_p4dp;
+ pud_t *start_pudp;
+ pmd_t *start_pmdp;
+ pgtable_t start_ptep;
+
+ unsigned long vaddr;
+ pgprot_t page_prot;
+ pgprot_t page_prot_none;
+
+ bool is_contiguous_page;
+ unsigned long pud_pfn;
+ unsigned long pmd_pfn;
+ unsigned long pte_pfn;
+
+ unsigned long fixed_pgd_pfn;
+ unsigned long fixed_p4d_pfn;
+ unsigned long fixed_pud_pfn;
+ unsigned long fixed_pmd_pfn;
+ unsigned long fixed_pte_pfn;
+};
+
+static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
{
pgprot_t prot = protection_map[idx];
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, prot);
unsigned long val = idx, *ptr = &val;
pr_debug("Validating PTE basic (%pGv)\n", ptr);
@@ -86,53 +119,63 @@ static void __init pte_basic_tests(unsigned long pfn, int idx)
WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
}
-static void __init pte_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pte_t *ptep,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+static void __init pte_advanced_tests(struct pgtable_debug_args *args)
{
+ struct page *page;
pte_t pte;
/*
* Architectures optimize set_pte_at by avoiding TLB flush.
* This requires set_pte_at to be not used to update an
* existing pte entry. Clear pte before we do set_pte_at
+ *
+ * flush_dcache_page() is called after set_pte_at() to clear
+ * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+	 * when it is released, and the page allocation check will fail when
+ * the page is allocated again. For architectures other than ARM64,
+ * the unexpected overhead of cache flushing is acceptable.
*/
+ page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL;
+ if (!page)
+ return;
pr_debug("Validating PTE advanced\n");
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
- ptep_set_wrprotect(mm, vaddr, ptep);
- pte = ptep_get(ptep);
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
+ ptep_set_wrprotect(args->mm, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(pte_write(pte));
- ptep_get_and_clear(mm, vaddr, ptep);
- pte = ptep_get(ptep);
+ ptep_get_and_clear(args->mm, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
- set_pte_at(mm, vaddr, ptep, pte);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
pte = pte_mkwrite(pte);
pte = pte_mkdirty(pte);
- ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
- pte = ptep_get(ptep);
+ ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1);
+ pte = ptep_get(args->ptep);
WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
- ptep_get_and_clear_full(mm, vaddr, ptep, 1);
- pte = ptep_get(ptep);
+ ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1);
+ pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
pte = pte_mkyoung(pte);
- set_pte_at(mm, vaddr, ptep, pte);
- ptep_test_and_clear_young(vma, vaddr, ptep);
- pte = ptep_get(ptep);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
+ ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(pte_young(pte));
}
-static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_savedwrite_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none);
if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
return;
@@ -143,7 +186,7 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_basic_tests(unsigned long pfn, int idx)
+static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
{
pgprot_t prot = protection_map[idx];
unsigned long val = idx, *ptr = &val;
@@ -153,7 +196,7 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx)
return;
pr_debug("Validating PMD basic (%pGv)\n", ptr);
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, prot);
/*
* This test needs to be executed after the given page table entry
@@ -181,57 +224,70 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx)
WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
}
-static void __init pmd_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pmd_t *pmdp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot, pgtable_t pgtable)
+static void __init pmd_advanced_tests(struct pgtable_debug_args *args)
{
+ struct page *page;
pmd_t pmd;
+ unsigned long vaddr = args->vaddr;
if (!has_transparent_hugepage())
return;
+ page = (args->pmd_pfn != ULONG_MAX) ? pfn_to_page(args->pmd_pfn) : NULL;
+ if (!page)
+ return;
+
+ /*
+ * flush_dcache_page() is called after set_pmd_at() to clear
+ * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+	 * when it is released, and the page allocation check will fail when
+ * the page is allocated again. For architectures other than ARM64,
+ * the unexpected overhead of cache flushing is acceptable.
+ */
pr_debug("Validating PMD advanced\n");
/* Align the address wrt HPAGE_PMD_SIZE */
vaddr &= HPAGE_PMD_MASK;
- pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+ pgtable_trans_huge_deposit(args->mm, args->pmdp, args->start_ptep);
- pmd = pfn_pmd(pfn, prot);
- set_pmd_at(mm, vaddr, pmdp, pmd);
- pmdp_set_wrprotect(mm, vaddr, pmdp);
- pmd = READ_ONCE(*pmdp);
+ pmd = pfn_pmd(args->pmd_pfn, args->page_prot);
+ set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+ flush_dcache_page(page);
+ pmdp_set_wrprotect(args->mm, vaddr, args->pmdp);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(pmd_write(pmd));
- pmdp_huge_get_and_clear(mm, vaddr, pmdp);
- pmd = READ_ONCE(*pmdp);
+ pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(!pmd_none(pmd));
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->pmd_pfn, args->page_prot);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
- set_pmd_at(mm, vaddr, pmdp, pmd);
+ set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+ flush_dcache_page(page);
pmd = pmd_mkwrite(pmd);
pmd = pmd_mkdirty(pmd);
- pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
- pmd = READ_ONCE(*pmdp);
+ pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
- pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
- pmd = READ_ONCE(*pmdp);
+ pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(!pmd_none(pmd));
- pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot));
pmd = pmd_mkyoung(pmd);
- set_pmd_at(mm, vaddr, pmdp, pmd);
- pmdp_test_and_clear_young(vma, vaddr, pmdp);
- pmd = READ_ONCE(*pmdp);
+ set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+ flush_dcache_page(page);
+ pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(pmd_young(pmd));
/* Clear the pte entries */
- pmdp_huge_get_and_clear(mm, vaddr, pmdp);
- pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
+ pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp);
+ pgtable_trans_huge_withdraw(args->mm, args->pmdp);
}
-static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_leaf_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -239,7 +295,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD leaf\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
/*
* PMD based THP is a leaf entry.
@@ -248,7 +304,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -259,13 +315,13 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD saved write\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none);
WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx)
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx)
{
pgprot_t prot = protection_map[idx];
unsigned long val = idx, *ptr = &val;
@@ -275,7 +331,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int
return;
pr_debug("Validating PUD basic (%pGv)\n", ptr);
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->fixed_pud_pfn, prot);
/*
* This test needs to be executed after the given page table entry
@@ -296,7 +352,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int
WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud))));
WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud))));
- if (mm_pmd_folded(mm))
+ if (mm_pmd_folded(args->mm))
return;
/*
@@ -306,58 +362,71 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int
WARN_ON(!pud_bad(pud_mkhuge(pud)));
}
-static void __init pud_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pud_t *pudp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+static void __init pud_advanced_tests(struct pgtable_debug_args *args)
{
+ struct page *page;
+ unsigned long vaddr = args->vaddr;
pud_t pud;
if (!has_transparent_hugepage())
return;
+ page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL;
+ if (!page)
+ return;
+
+ /*
+ * flush_dcache_page() is called after set_pud_at() to clear
+ * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+ * when the page is released, and the page allocation check will
+ * fail when the page is allocated again. For architectures other
+ * than ARM64, the extra cache flushing is an acceptable overhead.
+ */
pr_debug("Validating PUD advanced\n");
/* Align the address wrt HPAGE_PUD_SIZE */
vaddr &= HPAGE_PUD_MASK;
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_set_wrprotect(mm, vaddr, pudp);
- pud = READ_ONCE(*pudp);
+ pud = pfn_pud(args->pud_pfn, args->page_prot);
+ set_pud_at(args->mm, vaddr, args->pudp, pud);
+ flush_dcache_page(page);
+ pudp_set_wrprotect(args->mm, vaddr, args->pudp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(pud_write(pud));
#ifndef __PAGETABLE_PMD_FOLDED
- pudp_huge_get_and_clear(mm, vaddr, pudp);
- pud = READ_ONCE(*pudp);
+ pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->pud_pfn, args->page_prot);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
- set_pud_at(mm, vaddr, pudp, pud);
+ set_pud_at(args->mm, vaddr, args->pudp, pud);
+ flush_dcache_page(page);
pud = pud_mkwrite(pud);
pud = pud_mkdirty(pud);
- pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
- pud = READ_ONCE(*pudp);
+ pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
#ifndef __PAGETABLE_PMD_FOLDED
- pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
- pud = READ_ONCE(*pudp);
+ pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->pud_pfn, args->page_prot);
pud = pud_mkyoung(pud);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_test_and_clear_young(vma, vaddr, pudp);
- pud = READ_ONCE(*pudp);
+ set_pud_at(args->mm, vaddr, args->pudp, pud);
+ flush_dcache_page(page);
+ pudp_test_and_clear_young(args->vma, vaddr, args->pudp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(pud_young(pud));
- pudp_huge_get_and_clear(mm, vaddr, pudp);
+ pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
}
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_leaf_tests(struct pgtable_debug_args *args)
{
pud_t pud;
@@ -365,7 +434,7 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PUD leaf\n");
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
/*
* PUD based THP is a leaf entry.
*/
@@ -373,41 +442,26 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pud_leaf(pud));
}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
-static void __init pud_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pud_t *pudp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
-{
-}
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pud_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pud_leaf_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
-static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
-static void __init pmd_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pmd_t *pmdp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot, pgtable_t pgtable)
-{
-}
-static void __init pud_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pud_t *pudp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
-{
-}
-static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pud_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { }
+static void __init pud_leaf_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+static void __init pmd_huge_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
- if (!arch_vmap_pmd_supported(prot))
+ if (!arch_vmap_pmd_supported(args->page_prot))
return;
pr_debug("Validating PMD huge\n");
@@ -415,18 +469,18 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
* X86 defined pmd_set_huge() verifies that the given
* PMD is not a populated non-leaf entry.
*/
- WRITE_ONCE(*pmdp, __pmd(0));
- WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pmd_clear_huge(pmdp));
- pmd = READ_ONCE(*pmdp);
+ WRITE_ONCE(*args->pmdp, __pmd(0));
+ WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot));
+ WARN_ON(!pmd_clear_huge(args->pmdp));
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(!pmd_none(pmd));
}
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+static void __init pud_huge_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!arch_vmap_pud_supported(prot))
+ if (!arch_vmap_pud_supported(args->page_prot))
return;
pr_debug("Validating PUD huge\n");
@@ -434,18 +488,18 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
* X86 defined pud_set_huge() verifies that the given
* PUD is not a populated non-leaf entry.
*/
- WRITE_ONCE(*pudp, __pud(0));
- WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pud_clear_huge(pudp));
- pud = READ_ONCE(*pudp);
+ WRITE_ONCE(*args->pudp, __pud(0));
+ WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot));
+ WARN_ON(!pud_clear_huge(args->pudp));
+ pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
}
#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_huge_tests(struct pgtable_debug_args *args) { }
+static void __init pud_huge_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init p4d_basic_tests(struct pgtable_debug_args *args)
{
p4d_t p4d;
@@ -454,7 +508,7 @@ static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!p4d_same(p4d, p4d));
}
-static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pgd_basic_tests(struct pgtable_debug_args *args)
{
pgd_t pgd;
@@ -464,27 +518,26 @@ static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
}
#ifndef __PAGETABLE_PUD_FOLDED
-static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
+static void __init pud_clear_tests(struct pgtable_debug_args *args)
{
- pud_t pud = READ_ONCE(*pudp);
+ pud_t pud = READ_ONCE(*args->pudp);
- if (mm_pmd_folded(mm))
+ if (mm_pmd_folded(args->mm))
return;
pr_debug("Validating PUD clear\n");
pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
- WRITE_ONCE(*pudp, pud);
- pud_clear(pudp);
- pud = READ_ONCE(*pudp);
+ WRITE_ONCE(*args->pudp, pud);
+ pud_clear(args->pudp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
}
-static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
- pmd_t *pmdp)
+static void __init pud_populate_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (mm_pmd_folded(mm))
+ if (mm_pmd_folded(args->mm))
return;
pr_debug("Validating PUD populate\n");
@@ -492,40 +545,36 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
* This entry points to next level page table page.
* Hence this must not qualify as pud_bad().
*/
- pud_populate(mm, pudp, pmdp);
- pud = READ_ONCE(*pudp);
+ pud_populate(args->mm, args->pudp, args->start_pmdp);
+ pud = READ_ONCE(*args->pudp);
WARN_ON(pud_bad(pud));
}
#else /* !__PAGETABLE_PUD_FOLDED */
-static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { }
-static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
- pmd_t *pmdp)
-{
-}
+static void __init pud_clear_tests(struct pgtable_debug_args *args) { }
+static void __init pud_populate_tests(struct pgtable_debug_args *args) { }
#endif /* PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_P4D_FOLDED
-static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
+static void __init p4d_clear_tests(struct pgtable_debug_args *args)
{
- p4d_t p4d = READ_ONCE(*p4dp);
+ p4d_t p4d = READ_ONCE(*args->p4dp);
- if (mm_pud_folded(mm))
+ if (mm_pud_folded(args->mm))
return;
pr_debug("Validating P4D clear\n");
p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
- WRITE_ONCE(*p4dp, p4d);
- p4d_clear(p4dp);
- p4d = READ_ONCE(*p4dp);
+ WRITE_ONCE(*args->p4dp, p4d);
+ p4d_clear(args->p4dp);
+ p4d = READ_ONCE(*args->p4dp);
WARN_ON(!p4d_none(p4d));
}
-static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
- pud_t *pudp)
+static void __init p4d_populate_tests(struct pgtable_debug_args *args)
{
p4d_t p4d;
- if (mm_pud_folded(mm))
+ if (mm_pud_folded(args->mm))
return;
pr_debug("Validating P4D populate\n");
@@ -533,34 +582,33 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
* This entry points to next level page table page.
* Hence this must not qualify as p4d_bad().
*/
- pud_clear(pudp);
- p4d_clear(p4dp);
- p4d_populate(mm, p4dp, pudp);
- p4d = READ_ONCE(*p4dp);
+ pud_clear(args->pudp);
+ p4d_clear(args->p4dp);
+ p4d_populate(args->mm, args->p4dp, args->start_pudp);
+ p4d = READ_ONCE(*args->p4dp);
WARN_ON(p4d_bad(p4d));
}
-static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
+static void __init pgd_clear_tests(struct pgtable_debug_args *args)
{
- pgd_t pgd = READ_ONCE(*pgdp);
+ pgd_t pgd = READ_ONCE(*(args->pgdp));
- if (mm_p4d_folded(mm))
+ if (mm_p4d_folded(args->mm))
return;
pr_debug("Validating PGD clear\n");
pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
- WRITE_ONCE(*pgdp, pgd);
- pgd_clear(pgdp);
- pgd = READ_ONCE(*pgdp);
+ WRITE_ONCE(*args->pgdp, pgd);
+ pgd_clear(args->pgdp);
+ pgd = READ_ONCE(*args->pgdp);
WARN_ON(!pgd_none(pgd));
}
-static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
- p4d_t *p4dp)
+static void __init pgd_populate_tests(struct pgtable_debug_args *args)
{
pgd_t pgd;
- if (mm_p4d_folded(mm))
+ if (mm_p4d_folded(args->mm))
return;
pr_debug("Validating PGD populate\n");
@@ -568,56 +616,60 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
* This entry points to next level page table page.
* Hence this must not qualify as pgd_bad().
*/
- p4d_clear(p4dp);
- pgd_clear(pgdp);
- pgd_populate(mm, pgdp, p4dp);
- pgd = READ_ONCE(*pgdp);
+ p4d_clear(args->p4dp);
+ pgd_clear(args->pgdp);
+ pgd_populate(args->mm, args->pgdp, args->start_p4dp);
+ pgd = READ_ONCE(*args->pgdp);
WARN_ON(pgd_bad(pgd));
}
#else /* !__PAGETABLE_P4D_FOLDED */
-static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { }
-static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { }
-static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
- pud_t *pudp)
-{
-}
-static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
- p4d_t *p4dp)
-{
-}
+static void __init p4d_clear_tests(struct pgtable_debug_args *args) { }
+static void __init pgd_clear_tests(struct pgtable_debug_args *args) { }
+static void __init p4d_populate_tests(struct pgtable_debug_args *args) { }
+static void __init pgd_populate_tests(struct pgtable_debug_args *args) { }
#endif /* PAGETABLE_P4D_FOLDED */
-static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+static void __init pte_clear_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ struct page *page;
+ pte_t pte = pfn_pte(args->pte_pfn, args->page_prot);
+
+ page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL;
+ if (!page)
+ return;
+ /*
+ * flush_dcache_page() is called after set_pte_at() to clear
+ * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+ * when the page is released, and the page allocation check will
+ * fail when the page is allocated again. For architectures other
+ * than ARM64, the extra cache flushing is an acceptable overhead.
+ */
pr_debug("Validating PTE clear\n");
#ifndef CONFIG_RISCV
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
#endif
- set_pte_at(mm, vaddr, ptep, pte);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
barrier();
- pte_clear(mm, vaddr, ptep);
- pte = ptep_get(ptep);
+ pte_clear(args->mm, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
}
-static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
+static void __init pmd_clear_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = READ_ONCE(*pmdp);
+ pmd_t pmd = READ_ONCE(*args->pmdp);
pr_debug("Validating PMD clear\n");
pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
- WRITE_ONCE(*pmdp, pmd);
- pmd_clear(pmdp);
- pmd = READ_ONCE(*pmdp);
+ WRITE_ONCE(*args->pmdp, pmd);
+ pmd_clear(args->pmdp);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(!pmd_none(pmd));
}
-static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable)
+static void __init pmd_populate_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -626,14 +678,14 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
* This entry points to next level page table page.
* Hence this must not qualify as pmd_bad().
*/
- pmd_populate(mm, pmdp, pgtable);
- pmd = READ_ONCE(*pmdp);
+ pmd_populate(args->mm, args->pmdp, args->start_ptep);
+ pmd = READ_ONCE(*args->pmdp);
WARN_ON(pmd_bad(pmd));
}
-static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_special_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
return;
@@ -642,9 +694,9 @@ static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pte_special(pte_mkspecial(pte)));
}
-static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_protnone_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none);
if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
return;
@@ -655,7 +707,7 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_protnone_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -666,25 +718,25 @@ static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD protnone\n");
- pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ pmd = pmd_mkhuge(pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none));
WARN_ON(!pmd_protnone(pmd));
WARN_ON(!pmd_present(pmd));
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_devmap_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
pr_debug("Validating PTE devmap\n");
WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -692,12 +744,12 @@ static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD devmap\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_devmap_tests(struct pgtable_debug_args *args)
{
pud_t pud;
@@ -705,25 +757,25 @@ static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PUD devmap\n");
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#else
-static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pte_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
-static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
@@ -733,9 +785,9 @@ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
}
-static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
@@ -746,7 +798,7 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -757,12 +809,12 @@ static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD soft dirty\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
}
-static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -774,31 +826,29 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD swap soft dirty\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
-{
-}
+static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_swap_tests(struct pgtable_debug_args *args)
{
swp_entry_t swp;
pte_t pte;
pr_debug("Validating PTE swap\n");
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
swp = __pte_to_swp_entry(pte);
pte = __swp_entry_to_pte(swp);
- WARN_ON(pfn != pte_pfn(pte));
+ WARN_ON(args->fixed_pte_pfn != pte_pfn(pte));
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_swap_tests(struct pgtable_debug_args *args)
{
swp_entry_t swp;
pmd_t pmd;
@@ -807,16 +857,16 @@ static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD swap\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
swp = __pmd_to_swp_entry(pmd);
pmd = __swp_entry_to_pmd(swp);
- WARN_ON(pfn != pmd_pfn(pmd));
+ WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd));
}
#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_swap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static void __init swap_migration_tests(void)
+static void __init swap_migration_tests(struct pgtable_debug_args *args)
{
struct page *page;
swp_entry_t swp;
@@ -824,19 +874,18 @@ static void __init swap_migration_tests(void)
if (!IS_ENABLED(CONFIG_MIGRATION))
return;
- pr_debug("Validating swap migration\n");
/*
* swap_migration_tests() requires a dedicated page as it needs to
* be locked before creating a migration entry from it. Locking the
* page that actually maps kernel text ('start_kernel') can be real
- * problematic. Lets allocate a dedicated page explicitly for this
- * purpose that will be freed subsequently.
+ * problematic. Let's use the allocated page explicitly for this
+ * purpose.
*/
- page = alloc_page(GFP_KERNEL);
- if (!page) {
- pr_err("page allocation failed\n");
+ page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL;
+ if (!page)
return;
- }
+
+ pr_debug("Validating swap migration\n");
/*
* make_migration_entry() expects given page to be
@@ -855,11 +904,10 @@ static void __init swap_migration_tests(void)
WARN_ON(!is_migration_entry(swp));
WARN_ON(is_writable_migration_entry(swp));
__ClearPageLocked(page);
- __free_page(page);
}
#ifdef CONFIG_HUGETLB_PAGE
-static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
{
struct page *page;
pte_t pte;
@@ -869,25 +917,25 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
* Accessing the page associated with the pfn is safe here,
* as it was previously derived from a real kernel symbol.
*/
- page = pfn_to_page(pfn);
- pte = mk_huge_pte(page, prot);
+ page = pfn_to_page(args->fixed_pmd_pfn);
+ pte = mk_huge_pte(page, args->page_prot);
WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pte_huge(pte_mkhuge(pte)));
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
#else /* !CONFIG_HUGETLB_PAGE */
-static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_thp_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -906,7 +954,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
* needs to return true. pmd_present() should be true whenever
* pmd_trans_huge() returns true.
*/
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd)));
#ifndef __HAVE_ARCH_PMDP_INVALIDATE
@@ -916,7 +964,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_thp_tests(struct pgtable_debug_args *args)
{
pud_t pud;
@@ -924,7 +972,7 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PUD based THP\n");
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
/*
@@ -936,11 +984,11 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
*/
}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_thp_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_thp_tests(struct pgtable_debug_args *args) { }
+static void __init pud_thp_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static unsigned long __init get_random_vaddr(void)
@@ -955,43 +1003,179 @@ static unsigned long __init get_random_vaddr(void)
return random_vaddr;
}
-static int __init debug_vm_pgtable(void)
+static void __init destroy_args(struct pgtable_debug_args *args)
{
- struct vm_area_struct *vma;
- struct mm_struct *mm;
- pgd_t *pgdp;
- p4d_t *p4dp, *saved_p4dp;
- pud_t *pudp, *saved_pudp;
- pmd_t *pmdp, *saved_pmdp, pmd;
- pte_t *ptep;
- pgtable_t saved_ptep;
- pgprot_t prot, protnone;
- phys_addr_t paddr;
- unsigned long vaddr, pte_aligned, pmd_aligned;
- unsigned long pud_aligned, p4d_aligned, pgd_aligned;
- spinlock_t *ptl = NULL;
- int idx;
+ struct page *page = NULL;
+
+ /* Free (huge) page */
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+ has_transparent_hugepage() &&
+ args->pud_pfn != ULONG_MAX) {
+ if (args->is_contiguous_page) {
+ free_contig_range(args->pud_pfn,
+ (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT)));
+ } else {
+ page = pfn_to_page(args->pud_pfn);
+ __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
+ }
+
+ args->pud_pfn = ULONG_MAX;
+ args->pmd_pfn = ULONG_MAX;
+ args->pte_pfn = ULONG_MAX;
+ }
- pr_info("Validating architecture page table helpers\n");
- prot = vm_get_page_prot(VMFLAGS);
- vaddr = get_random_vaddr();
- mm = mm_alloc();
- if (!mm) {
- pr_err("mm_struct allocation failed\n");
- return 1;
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ has_transparent_hugepage() &&
+ args->pmd_pfn != ULONG_MAX) {
+ if (args->is_contiguous_page) {
+ free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER));
+ } else {
+ page = pfn_to_page(args->pmd_pfn);
+ __free_pages(page, HPAGE_PMD_ORDER);
+ }
+
+ args->pmd_pfn = ULONG_MAX;
+ args->pte_pfn = ULONG_MAX;
}
+ if (args->pte_pfn != ULONG_MAX) {
+ page = pfn_to_page(args->pte_pfn);
+ __free_pages(page, 0);
+
+ args->pte_pfn = ULONG_MAX;
+ }
+
+ /* Free page table entries */
+ if (args->start_ptep) {
+ pte_free(args->mm, args->start_ptep);
+ mm_dec_nr_ptes(args->mm);
+ }
+
+ if (args->start_pmdp) {
+ pmd_free(args->mm, args->start_pmdp);
+ mm_dec_nr_pmds(args->mm);
+ }
+
+ if (args->start_pudp) {
+ pud_free(args->mm, args->start_pudp);
+ mm_dec_nr_puds(args->mm);
+ }
+
+ if (args->start_p4dp)
+ p4d_free(args->mm, args->start_p4dp);
+
+ /* Free vma and mm struct */
+ if (args->vma)
+ vm_area_free(args->vma);
+
+ if (args->mm)
+ mmdrop(args->mm);
+}
+
+static struct page * __init
+debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
+{
+ struct page *page = NULL;
+
+#ifdef CONFIG_CONTIG_ALLOC
+ if (order >= MAX_ORDER) {
+ page = alloc_contig_pages((1 << order), GFP_KERNEL,
+ first_online_node, NULL);
+ if (page) {
+ args->is_contiguous_page = true;
+ return page;
+ }
+ }
+#endif
+
+ if (order < MAX_ORDER)
+ page = alloc_pages(GFP_KERNEL, order);
+
+ return page;
+}
+
+static int __init init_args(struct pgtable_debug_args *args)
+{
+ struct page *page = NULL;
+ phys_addr_t phys;
+ int ret = 0;
+
/*
+ * Initialize the debugging data.
+ *
* __P000 (or even __S000) will help create page table entries with
* PROT_NONE permission as required for pxx_protnone_tests().
*/
- protnone = __P000;
+ memset(args, 0, sizeof(*args));
+ args->vaddr = get_random_vaddr();
+ args->page_prot = vm_get_page_prot(VMFLAGS);
+ args->page_prot_none = __P000;
+ args->is_contiguous_page = false;
+ args->pud_pfn = ULONG_MAX;
+ args->pmd_pfn = ULONG_MAX;
+ args->pte_pfn = ULONG_MAX;
+ args->fixed_pgd_pfn = ULONG_MAX;
+ args->fixed_p4d_pfn = ULONG_MAX;
+ args->fixed_pud_pfn = ULONG_MAX;
+ args->fixed_pmd_pfn = ULONG_MAX;
+ args->fixed_pte_pfn = ULONG_MAX;
+
+ /* Allocate mm and vma */
+ args->mm = mm_alloc();
+ if (!args->mm) {
+ pr_err("Failed to allocate mm struct\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ args->vma = vm_area_alloc(args->mm);
+ if (!args->vma) {
+ pr_err("Failed to allocate vma\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /*
+ * Allocate page table entries. They will be modified in the tests.
+ * Let's save the page table entries so that they can be released
+ * when the tests are completed.
+ */
+ args->pgdp = pgd_offset(args->mm, args->vaddr);
+ args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
+ if (!args->p4dp) {
+ pr_err("Failed to allocate p4d entries\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+ args->start_p4dp = p4d_offset(args->pgdp, 0UL);
+ WARN_ON(!args->start_p4dp);
+
+ args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr);
+ if (!args->pudp) {
+ pr_err("Failed to allocate pud entries\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+ args->start_pudp = pud_offset(args->p4dp, 0UL);
+ WARN_ON(!args->start_pudp);
+
+ args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr);
+ if (!args->pmdp) {
+ pr_err("Failed to allocate pmd entries\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+ args->start_pmdp = pmd_offset(args->pudp, 0UL);
+ WARN_ON(!args->start_pmdp);
- vma = vm_area_alloc(mm);
- if (!vma) {
- pr_err("vma allocation failed\n");
- return 1;
+ if (pte_alloc(args->mm, args->pmdp)) {
+ pr_err("Failed to allocate pte entries\n");
+ ret = -ENOMEM;
+ goto error;
}
+ args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp));
+ WARN_ON(!args->start_ptep);
/*
* PFN for mapping at PTE level is determined from a standard kernel
@@ -1000,40 +1184,65 @@ static int __init debug_vm_pgtable(void)
* exist on the platform but that does not really matter as pfn_pxx()
* helpers will still create appropriate entries for the test. This
* helps avoid large memory block allocations to be used for mapping
- * at higher page table levels.
+ * at higher page table levels in some of the tests.
*/
- paddr = __pa_symbol(&start_kernel);
-
- pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT;
- pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT;
- pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT;
- p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT;
- pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT;
- WARN_ON(!pfn_valid(pte_aligned));
-
- pgdp = pgd_offset(mm, vaddr);
- p4dp = p4d_alloc(mm, pgdp, vaddr);
- pudp = pud_alloc(mm, p4dp, vaddr);
- pmdp = pmd_alloc(mm, pudp, vaddr);
+ phys = __pa_symbol(&start_kernel);
+ args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+ args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+ args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+ args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+ args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
+ WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+
/*
- * Allocate pgtable_t
+ * Allocate (huge) pages because some of the tests need to access
+ * the data in the pages. The corresponding tests will be skipped
+ * if we fail to allocate (huge) pages.
*/
- if (pte_alloc(mm, pmdp)) {
- pr_err("pgtable allocation failed\n");
- return 1;
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+ has_transparent_hugepage()) {
+ page = debug_vm_pgtable_alloc_huge_page(args,
+ HPAGE_PUD_SHIFT - PAGE_SHIFT);
+ if (page) {
+ args->pud_pfn = page_to_pfn(page);
+ args->pmd_pfn = args->pud_pfn;
+ args->pte_pfn = args->pud_pfn;
+ return 0;
+ }
}
- /*
- * Save all the page table page addresses as the page table
- * entries will be used for testing with random or garbage
- * values. These saved addresses will be used for freeing
- * page table pages.
- */
- pmd = READ_ONCE(*pmdp);
- saved_p4dp = p4d_offset(pgdp, 0UL);
- saved_pudp = pud_offset(p4dp, 0UL);
- saved_pmdp = pmd_offset(pudp, 0UL);
- saved_ptep = pmd_pgtable(pmd);
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ has_transparent_hugepage()) {
+ page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PMD_ORDER);
+ if (page) {
+ args->pmd_pfn = page_to_pfn(page);
+ args->pte_pfn = args->pmd_pfn;
+ return 0;
+ }
+ }
+
+ page = alloc_pages(GFP_KERNEL, 0);
+ if (page)
+ args->pte_pfn = page_to_pfn(page);
+
+ return 0;
+
+error:
+ destroy_args(args);
+ return ret;
+}
+
+static int __init debug_vm_pgtable(void)
+{
+ struct pgtable_debug_args args;
+ spinlock_t *ptl = NULL;
+ int idx, ret;
+
+ pr_info("Validating architecture page table helpers\n");
+ ret = init_args(&args);
+ if (ret)
+ return ret;
/*
* Iterate over the protection_map[] to make sure that all
@@ -1042,9 +1251,9 @@ static int __init debug_vm_pgtable(void)
* given page table entry.
*/
for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) {
- pte_basic_tests(pte_aligned, idx);
- pmd_basic_tests(pmd_aligned, idx);
- pud_basic_tests(mm, pud_aligned, idx);
+ pte_basic_tests(&args, idx);
+ pmd_basic_tests(&args, idx);
+ pud_basic_tests(&args, idx);
}
/*
@@ -1054,79 +1263,70 @@ static int __init debug_vm_pgtable(void)
* the above iteration for now to save some test execution
* time.
*/
- p4d_basic_tests(p4d_aligned, prot);
- pgd_basic_tests(pgd_aligned, prot);
+ p4d_basic_tests(&args);
+ pgd_basic_tests(&args);
- pmd_leaf_tests(pmd_aligned, prot);
- pud_leaf_tests(pud_aligned, prot);
+ pmd_leaf_tests(&args);
+ pud_leaf_tests(&args);
- pte_savedwrite_tests(pte_aligned, protnone);
- pmd_savedwrite_tests(pmd_aligned, protnone);
+ pte_savedwrite_tests(&args);
+ pmd_savedwrite_tests(&args);
- pte_special_tests(pte_aligned, prot);
- pte_protnone_tests(pte_aligned, protnone);
- pmd_protnone_tests(pmd_aligned, protnone);
+ pte_special_tests(&args);
+ pte_protnone_tests(&args);
+ pmd_protnone_tests(&args);
- pte_devmap_tests(pte_aligned, prot);
- pmd_devmap_tests(pmd_aligned, prot);
- pud_devmap_tests(pud_aligned, prot);
+ pte_devmap_tests(&args);
+ pmd_devmap_tests(&args);
+ pud_devmap_tests(&args);
- pte_soft_dirty_tests(pte_aligned, prot);
- pmd_soft_dirty_tests(pmd_aligned, prot);
- pte_swap_soft_dirty_tests(pte_aligned, prot);
- pmd_swap_soft_dirty_tests(pmd_aligned, prot);
+ pte_soft_dirty_tests(&args);
+ pmd_soft_dirty_tests(&args);
+ pte_swap_soft_dirty_tests(&args);
+ pmd_swap_soft_dirty_tests(&args);
- pte_swap_tests(pte_aligned, prot);
- pmd_swap_tests(pmd_aligned, prot);
+ pte_swap_tests(&args);
+ pmd_swap_tests(&args);
- swap_migration_tests();
+ swap_migration_tests(&args);
- pmd_thp_tests(pmd_aligned, prot);
- pud_thp_tests(pud_aligned, prot);
+ pmd_thp_tests(&args);
+ pud_thp_tests(&args);
- hugetlb_basic_tests(pte_aligned, prot);
+ hugetlb_basic_tests(&args);
/*
* Page table modifying tests. They need to hold
* proper page table lock.
*/
- ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl);
- pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
- pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
- pte_unmap_unlock(ptep, ptl);
+ args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl);
+ pte_clear_tests(&args);
+ pte_advanced_tests(&args);
+ pte_unmap_unlock(args.ptep, ptl);
- ptl = pmd_lock(mm, pmdp);
- pmd_clear_tests(mm, pmdp);
- pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
- pmd_huge_tests(pmdp, pmd_aligned, prot);
- pmd_populate_tests(mm, pmdp, saved_ptep);
+ ptl = pmd_lock(args.mm, args.pmdp);
+ pmd_clear_tests(&args);
+ pmd_advanced_tests(&args);
+ pmd_huge_tests(&args);
+ pmd_populate_tests(&args);
spin_unlock(ptl);
- ptl = pud_lock(mm, pudp);
- pud_clear_tests(mm, pudp);
- pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
- pud_huge_tests(pudp, pud_aligned, prot);
- pud_populate_tests(mm, pudp, saved_pmdp);
+ ptl = pud_lock(args.mm, args.pudp);
+ pud_clear_tests(&args);
+ pud_advanced_tests(&args);
+ pud_huge_tests(&args);
+ pud_populate_tests(&args);
spin_unlock(ptl);
- spin_lock(&mm->page_table_lock);
- p4d_clear_tests(mm, p4dp);
- pgd_clear_tests(mm, pgdp);
- p4d_populate_tests(mm, p4dp, saved_pudp);
- pgd_populate_tests(mm, pgdp, saved_p4dp);
- spin_unlock(&mm->page_table_lock);
-
- p4d_free(mm, saved_p4dp);
- pud_free(mm, saved_pudp);
- pmd_free(mm, saved_pmdp);
- pte_free(mm, saved_ptep);
-
- vm_area_free(vma);
- mm_dec_nr_puds(mm);
- mm_dec_nr_pmds(mm);
- mm_dec_nr_ptes(mm);
- mmdrop(mm);
+ spin_lock(&(args.mm->page_table_lock));
+ p4d_clear_tests(&args);
+ pgd_clear_tests(&args);
+ p4d_populate_tests(&args);
+ pgd_populate_tests(&args);
+ spin_unlock(&(args.mm->page_table_lock));
+
+ destroy_args(&args);
return 0;
}
late_initcall(debug_vm_pgtable);
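For reference while reading the converted helpers above: every test now takes a single pointer to struct pgtable_debug_args. The sketch below reconstructs that structure purely from the field accesses visible in this file's hunks; the authoritative definition is added earlier in the patch, outside the portion quoted here, so field order, exact types and any additional members should be treated as approximate.

/*
 * Sketch only: struct pgtable_debug_args as implied by the usages above.
 * Not copied from the patch; the real definition may differ in detail.
 */
struct pgtable_debug_args {
	struct mm_struct	*mm;
	struct vm_area_struct	*vma;

	pgd_t			*pgdp;
	p4d_t			*p4dp;
	pud_t			*pudp;
	pmd_t			*pmdp;
	pte_t			*ptep;

	p4d_t			*start_p4dp;
	pud_t			*start_pudp;
	pmd_t			*start_pmdp;
	pgtable_t		start_ptep;

	unsigned long		vaddr;
	pgprot_t		page_prot;
	pgprot_t		page_prot_none;

	bool			is_contiguous_page;	/* set on the alloc_contig_pages() path */
	unsigned long		pud_pfn;		/* ULONG_MAX when no page was allocated */
	unsigned long		pmd_pfn;
	unsigned long		pte_pfn;

	unsigned long		fixed_pgd_pfn;		/* derived from __pa_symbol(&start_kernel) */
	unsigned long		fixed_p4d_pfn;
	unsigned long		fixed_pud_pfn;
	unsigned long		fixed_pmd_pfn;
	unsigned long		fixed_pte_pfn;
};

The fixed_*_pfn values back the read-only basic/leaf/devmap style tests, while pud_pfn, pmd_pfn and pte_pfn point at pages allocated in init_args() (left as ULONG_MAX on allocation failure) so the advanced tests that actually touch the page data can be skipped safely.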
diff --git a/mm/filemap.c b/mm/filemap.c
index 920e8dc03251..dae481293b5d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -260,12 +260,11 @@ static void page_cache_free_page(struct address_space *mapping,
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- unsigned long flags;
BUG_ON(!PageLocked(page));
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xa_lock_irq(&mapping->i_pages);
__delete_from_page_cache(page, NULL);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
page_cache_free_page(mapping, page);
}
@@ -337,19 +336,18 @@ void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec)
{
int i;
- unsigned long flags;
if (!pagevec_count(pvec))
return;
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xa_lock_irq(&mapping->i_pages);
for (i = 0; i < pagevec_count(pvec); i++) {
trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
page_cache_delete_batch(mapping, pvec);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
for (i = 0; i < pagevec_count(pvec); i++)
page_cache_free_page(mapping, pvec->pages[i]);
@@ -841,7 +839,6 @@ void replace_page_cache_page(struct page *old, struct page *new)
void (*freepage)(struct page *) = mapping->a_ops->freepage;
pgoff_t offset = old->index;
XA_STATE(xas, &mapping->i_pages, offset);
- unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(old), old);
VM_BUG_ON_PAGE(!PageLocked(new), new);
@@ -853,7 +850,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
mem_cgroup_migrate(old, new);
- xas_lock_irqsave(&xas, flags);
+ xas_lock_irq(&xas);
xas_store(&xas, new);
old->mapping = NULL;
@@ -866,7 +863,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
__dec_lruvec_page_state(old, NR_SHMEM);
if (PageSwapBacked(new))
__inc_lruvec_page_state(new, NR_SHMEM);
- xas_unlock_irqrestore(&xas, flags);
+ xas_unlock_irq(&xas);
if (freepage)
freepage(old);
put_page(old);
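The three filemap.c hunks above all make the same substitution: the irqsave/irqrestore lock forms become plain _irq forms, dropping the per-call flags word, which presumes these paths are never entered with interrupts already disabled. A minimal kernel-style contrast of the two idioms (a fragment assuming <linux/xarray.h>, for comparison only, not meant to build outside a kernel tree):

/* Illustration only: the two locking idioms involved in the hunks above. */
static void erase_entry_irqsave(struct xarray *xa, unsigned long index)
{
	unsigned long flags;

	/* Safe even if the caller may already have interrupts disabled. */
	xa_lock_irqsave(xa, flags);
	__xa_erase(xa, index);
	xa_unlock_irqrestore(xa, flags);
}

static void erase_entry_irq(struct xarray *xa, unsigned long index)
{
	/* Requires interrupts to be enabled on entry; no flags to carry. */
	xa_lock_irq(xa);
	__xa_erase(xa, index);
	xa_unlock_irq(xa);
}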
diff --git a/mm/gup.c b/mm/gup.c
index b94717977d17..9935a4480710 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -62,11 +62,24 @@ static void put_page_refs(struct page *page, int refs)
put_page(page);
}
-/*
- * Return the compound head page with ref appropriately incremented,
- * or NULL if that failed.
+/**
+ * try_get_compound_head() - return the compound head page with refcount
+ * appropriately incremented, or NULL if that failed.
+ *
+ * This handles potential refcount overflow correctly. It also works correctly
+ * for various lockless get_user_pages()-related callers, due to the use of
+ * page_cache_add_speculative().
+ *
+ * Even though the name includes "compound_head", this function is still
+ * appropriate for callers that have a non-compound @page to get.
+ *
+ * @page: pointer to page to be gotten
+ * @refs: the value to add to the page's refcount
+ *
+ * Return: head page (with refcount appropriately incremented) for success, or
+ * NULL upon failure.
*/
-static inline struct page *try_get_compound_head(struct page *page, int refs)
+struct page *try_get_compound_head(struct page *page, int refs)
{
struct page *head = compound_head(page);
@@ -92,10 +105,17 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
return head;
}
-/*
+/**
* try_grab_compound_head() - attempt to elevate a page's refcount, by a
* flags-dependent amount.
*
+ * Even though the name includes "compound_head", this function is still
+ * appropriate for callers that have a non-compound @page to get.
+ *
+ * @page: pointer to page to be grabbed
+ * @refs: the value to (effectively) add to the page's refcount
+ * @flags: gup flags: these are the FOLL_* flag values.
+ *
* "grab" names in this file mean, "look at flags to decide whether to use
* FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
*
@@ -103,22 +123,26 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
* same time. (That's true throughout the get_user_pages*() and
* pin_user_pages*() APIs.) Cases:
*
- * FOLL_GET: page's refcount will be incremented by 1.
- * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ * FOLL_GET: page's refcount will be incremented by @refs.
+ *
+ * FOLL_PIN on compound pages that are > two pages long: page's refcount will
+ * be incremented by @refs, and page[2].hpage_pinned_refcount will be
+ * incremented by @refs * GUP_PIN_COUNTING_BIAS.
+ *
+ * FOLL_PIN on normal pages, or compound pages that are two pages long:
+ * page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS.
*
* Return: head page (with refcount appropriately incremented) for success, or
* NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
* considered failure, and furthermore, a likely bug in the caller, so a warning
* is also emitted.
*/
-__maybe_unused struct page *try_grab_compound_head(struct page *page,
- int refs, unsigned int flags)
+struct page *try_grab_compound_head(struct page *page,
+ int refs, unsigned int flags)
{
if (flags & FOLL_GET)
return try_get_compound_head(page, refs);
else if (flags & FOLL_PIN) {
- int orig_refs = refs;
-
/*
* Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
* right zone, so fail and let the caller fall back to the slow
@@ -143,6 +167,8 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
*
* However, be sure to *also* increment the normal page refcount
* field at least once, so that the page really is pinned.
+ * That's why the refcount from the earlier
+ * try_get_compound_head() is left intact.
*/
if (hpage_pincount_available(page))
hpage_pincount_add(page, refs);
@@ -150,7 +176,7 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
- orig_refs);
+ refs);
return page;
}
@@ -186,10 +212,8 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
* @flags: gup flags: these are the FOLL_* flag values.
*
* Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
- * time. Cases:
- *
- * FOLL_GET: page's refcount will be incremented by 1.
- * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ * time. Cases: please see the try_grab_compound_head() documentation, with
+ * "refs=1".
*
* Return: true for success, or if no action was required (if neither FOLL_PIN
* nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
@@ -197,35 +221,10 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
*/
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
- WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
+ return true;
- if (flags & FOLL_GET)
- return try_get_page(page);
- else if (flags & FOLL_PIN) {
- int refs = 1;
-
- page = compound_head(page);
-
- if (WARN_ON_ONCE(page_ref_count(page) <= 0))
- return false;
-
- if (hpage_pincount_available(page))
- hpage_pincount_add(page, 1);
- else
- refs = GUP_PIN_COUNTING_BIAS;
-
- /*
- * Similar to try_grab_compound_head(): even if using the
- * hpage_pincount_add/_sub() routines, be sure to
- * *also* increment the normal page refcount field at least
- * once, so that the page really is pinned.
- */
- page_ref_add(page, refs);
-
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
- }
-
- return true;
+ return try_grab_compound_head(page, 1, flags);
}
/**
@@ -1151,7 +1150,6 @@ static long __get_user_pages(struct mm_struct *mm,
* We must stop here.
*/
BUG_ON(gup_flags & FOLL_NOWAIT);
- BUG_ON(ret != 0);
goto out;
}
continue;
@@ -1276,7 +1274,7 @@ int fixup_user_fault(struct mm_struct *mm,
bool *unlocked)
{
struct vm_area_struct *vma;
- vm_fault_t ret, major = 0;
+ vm_fault_t ret;
address = untagged_addr(address);
@@ -1296,7 +1294,6 @@ retry:
return -EINTR;
ret = handle_mm_fault(vma, address, fault_flags, NULL);
- major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -1475,8 +1472,8 @@ long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long nr_pages = (end - start) / PAGE_SIZE;
int gup_flags;
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
VM_BUG_ON_VMA(start < vma->vm_start, vma);
VM_BUG_ON_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
@@ -1775,7 +1772,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
if (!list_empty(&movable_page_list)) {
ret = migrate_pages(&movable_page_list, alloc_migration_target,
NULL, (unsigned long)&mtc, MIGRATE_SYNC,
- MR_LONGTERM_PIN);
+ MR_LONGTERM_PIN, NULL);
if (ret && !list_empty(&movable_page_list))
putback_movable_pages(&movable_page_list);
}
@@ -2244,6 +2241,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
{
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
+ int ret = 1;
do {
struct page *page = pfn_to_page(pfn);
@@ -2251,21 +2249,22 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
+ ret = 0;
+ break;
}
SetPageReferenced(page);
pages[*nr] = page;
if (unlikely(!try_grab_page(page, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
+ ret = 0;
+ break;
}
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
- if (pgmap)
- put_dev_pagemap(pgmap);
- return 1;
+ put_dev_pagemap(pgmap);
+ return ret;
}
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
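The final gup.c hunk above restructures __gup_device_huge() so the function always falls through to a single put_dev_pagemap() call instead of returning early, relying on put_dev_pagemap() treating a NULL pgmap as a no-op. The standalone toy below models just that single-exit shape; fake_get() and fake_put() are made-up stand-ins for the pagemap lookup and release, and the undo_dev_pagemap() bookkeeping on the failure paths is left out of the model.

#include <stdio.h>
#include <stdlib.h>

struct fake_pgmap { unsigned long dummy; };

static void fake_put(struct fake_pgmap *pgmap)
{
	free(pgmap);			/* free(NULL) is a no-op, like put_dev_pagemap(NULL) */
}

static struct fake_pgmap *fake_get(unsigned long pfn, struct fake_pgmap *cached)
{
	if (pfn >= 3) {			/* pretend pfns >= 3 have no backing pagemap */
		fake_put(cached);	/* drop the cached handle, as the real lookup would */
		return NULL;
	}
	return cached ? cached : calloc(1, sizeof(*cached));
}

static int walk(unsigned long start, unsigned long end)
{
	struct fake_pgmap *pgmap = NULL;
	unsigned long pfn;
	int ret = 1;

	for (pfn = start; pfn != end; pfn++) {
		pgmap = fake_get(pfn, pgmap);
		if (!pgmap) {
			ret = 0;
			break;		/* fall through to the common release below */
		}
	}

	fake_put(pgmap);		/* single release point, on success and failure alike */
	return ret;
}

int main(void)
{
	printf("walk over backed pfns:     %d\n", walk(0, 3));	/* prints 1 */
	printf("walk hitting unbacked pfn: %d\n", walk(0, 5));	/* prints 0 */
	return 0;
}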
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index afff3ac87067..5e9ef0fc261e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1440,32 +1440,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
goto out;
}
- /*
- * Since we took the NUMA fault, we must have observed the !accessible
- * bit. Make sure all other CPUs agree with that, to avoid them
- * modifying the page we're about to migrate.
- *
- * Must be done under PTL such that we'll observe the relevant
- * inc_tlb_flush_pending().
- *
- * We are not sure a pending tlb flush here is for a huge page
- * mapping or not. Hence use the tlb range variant
- */
- if (mm_tlb_flush_pending(vma->vm_mm)) {
- flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
- /*
- * change_huge_pmd() released the pmd lock before
- * invalidating the secondary MMUs sharing the primary
- * MMU pagetables (with ->invalidate_range()). The
- * mmu_notifier_invalidate_range_end() (which
- * internally calls ->invalidate_range()) in
- * change_pmd_range() will run after us, so we can't
- * rely on it here and we need an explicit invalidate.
- */
- mmu_notifier_invalidate_range(vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
- }
-
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
page = vm_normal_page_pmd(vma, haddr, pmd);
if (!page)
@@ -2454,11 +2428,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
- /* Some pages can be beyond i_size: drop them from page cache */
+ /* Some pages can be beyond EOF: drop them from page cache */
if (head[i].index >= end) {
ClearPageDirty(head + i);
__delete_from_page_cache(head + i, NULL);
- if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
+ if (shmem_mapping(head->mapping))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
} else if (!PageAnon(page)) {
@@ -2686,6 +2660,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* head page lock is good enough to serialize the trimming.
*/
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ if (shmem_mapping(mapping))
+ end = shmem_fallocend(mapping->host, end);
}
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8ea35ba6699f..95dc7b83381f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1072,6 +1072,8 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
int nid = page_to_nid(page);
lockdep_assert_held(&hugetlb_lock);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
@@ -1143,7 +1145,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
unsigned long address, int avoid_reserve,
long chg)
{
- struct page *page;
+ struct page *page = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask;
nodemask_t *nodemask;
@@ -1164,7 +1166,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
gfp_mask = htlb_alloc_mask(h);
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
+ if (mpol_is_preferred_many(mpol)) {
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
+ /* Fallback to all nodes if page==NULL */
+ nodemask = NULL;
+ }
+
+ if (!page)
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
@@ -1368,8 +1380,28 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
h->surplus_huge_pages_node[nid]--;
}
+ /*
+ * Very subtle
+ *
+ * For non-gigantic pages set the destructor to the normal compound
+ * page dtor. This is needed in case someone takes an additional
+ * temporary ref to the page, and freeing is delayed until they drop
+ * their reference.
+ *
+ * For gigantic pages set the destructor to the null dtor. This
+ * destructor will never be called. Before freeing the gigantic
+ * page destroy_compound_gigantic_page will turn the compound page
+ * into a simple group of pages. After this the destructor does not
+ * apply.
+ *
+ * This handles the case where more than one ref is held when and
+ * after update_and_free_page is called.
+ */
set_page_refcounted(page);
- set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ if (hstate_is_gigantic(h))
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ else
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
@@ -1399,11 +1431,20 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
SetHPageVmemmapOptimized(page);
/*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the last reference.
+ * This page is about to be managed by the hugetlb allocator and
+ * should have no users. Drop our reference, and check for others
+ * just in case.
*/
zeroed = put_page_testzero(page);
- VM_BUG_ON_PAGE(!zeroed, page);
+ if (!zeroed)
+ /*
+ * It is VERY unlikely that someone else has taken a ref on
+ * the page. In this case, we simply return as the
+ * hugetlb destructor (free_huge_page) will be called
+ * when this other ref is dropped.
+ */
+ return;
+
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
}
@@ -1657,16 +1698,14 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
* cache adding could take a ref on a 'to be' tail page.
* We need to respect any increased ref count, and only set
* the ref count to zero if count is currently 1. If count
- * is not 1, we call synchronize_rcu in the hope that a rcu
- * grace period will cause ref count to drop and then retry.
- * If count is still inflated on retry we return an error and
- * must discard the pages.
+ * is not 1, we return an error. An error return indicates
+ * the set of pages cannot be converted to a gigantic page.
+ * The caller who allocated the pages should then discard the
+ * pages using the appropriate free interface.
*/
if (!page_ref_freeze(p, 1)) {
- pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
- synchronize_rcu();
- if (!page_ref_freeze(p, 1))
- goto out_error;
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ goto out_error;
}
set_page_count(p, 0);
set_compound_head(p, page);
@@ -1830,7 +1869,6 @@ retry:
retry = true;
goto retry;
}
- pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
return NULL;
}
}
@@ -2020,9 +2058,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
* Allocates a fresh surplus page from the page allocator.
*/
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+ int nid, nodemask_t *nmask, bool zero_ref)
{
struct page *page = NULL;
+ bool retry = false;
if (hstate_is_gigantic(h))
return NULL;
@@ -2032,6 +2071,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
+retry:
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -2049,11 +2089,35 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
spin_unlock_irq(&hugetlb_lock);
put_page(page);
return NULL;
- } else {
- h->surplus_huge_pages++;
- h->surplus_huge_pages_node[page_to_nid(page)]++;
}
+ if (zero_ref) {
+ /*
+ * Caller requires a page with zero ref count.
+ * We will drop ref count here. If someone else is holding
+ * a ref, the page will be freed when they drop it. Abuse
+ * temporary page flag to accomplish this.
+ */
+ SetHPageTemporary(page);
+ if (!put_page_testzero(page)) {
+ /*
+ * Unexpected inflated ref count on freshly allocated
+ * huge. Retry once.
+ */
+ pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
+ spin_unlock_irq(&hugetlb_lock);
+ if (retry)
+ return NULL;
+
+ retry = true;
+ goto retry;
+ }
+ ClearHPageTemporary(page);
+ }
+
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[page_to_nid(page)]++;
+
out_unlock:
spin_unlock_irq(&hugetlb_lock);
@@ -2088,16 +2152,26 @@ static
struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
+ struct page *page = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask = htlb_alloc_mask(h);
int nid;
nodemask_t *nodemask;
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
- page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
- mpol_cond_put(mpol);
+ if (mpol_is_preferred_many(mpol)) {
+ gfp_t gfp = gfp_mask | __GFP_NOWARN;
+ gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+ page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+
+ /* Fallback to all nodes if page==NULL */
+ nodemask = NULL;
+ }
+
+ if (!page)
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
+ mpol_cond_put(mpol);
return page;
}
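
The first pass above deliberately weakens the gfp mask so that failure on the preferred nodes is cheap and silent; only the unrestricted second pass uses the caller's full mask. A minimal helper expressing that adjustment (a sketch only, not kernel API; the open-coded version is what the hunk actually adds):

	static inline gfp_t optimistic_pass_gfp(gfp_t gfp)
	{
		gfp |= __GFP_NOWARN;				/* no allocation-failure splat   */
		gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);	/* fail fast, never loop forever */
		return gfp;
	}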
@@ -2167,7 +2241,7 @@ retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
- NUMA_NO_NODE, NULL);
+ NUMA_NO_NODE, NULL, true);
if (!page) {
alloc_ok = false;
break;
@@ -2208,24 +2282,20 @@ retry:
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- int zeroed;
-
if ((--needed) < 0)
break;
- /*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the buddy allocator's reference.
- */
- zeroed = put_page_testzero(page);
- VM_BUG_ON_PAGE(!zeroed, page);
+ /* Add the page to the hugetlb allocator */
enqueue_huge_page(h, page);
}
free:
spin_unlock_irq(&hugetlb_lock);
- /* Free unnecessary surplus pages to the buddy allocator */
+ /*
+ * Free unnecessary surplus pages to the buddy allocator.
+ * Pages have no ref count, call free_huge_page directly.
+ */
list_for_each_entry_safe(page, tmp, &surplus_list, lru)
- put_page(page);
+ free_huge_page(page);
spin_lock_irq(&hugetlb_lock);
return ret;
@@ -2534,6 +2604,7 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nid = page_to_nid(old_page);
+ bool alloc_retry = false;
struct page *new_page;
int ret = 0;
@@ -2544,9 +2615,30 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
* the pool. This simplifies and let us do most of the processing
* under the lock.
*/
+alloc_retry:
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
+ /*
+ * If all goes well, this page will be directly added to the free
+ * list in the pool. For this the ref count needs to be zero.
+ * Attempt to drop now, and retry once if needed. It is VERY
+ * unlikely there is another ref on the page.
+ *
+ * If someone else has a reference to the page, it will be freed
+ * when they drop their ref. Abuse temporary page flag to accomplish
+ * this. Retry once if there is an inflated ref count.
+ */
+ SetHPageTemporary(new_page);
+ if (!put_page_testzero(new_page)) {
+ if (alloc_retry)
+ return -EBUSY;
+
+ alloc_retry = true;
+ goto alloc_retry;
+ }
+ ClearHPageTemporary(new_page);
+
__prep_new_huge_page(h, new_page);
retry:
@@ -2586,11 +2678,10 @@ retry:
remove_hugetlb_page(h, old_page, false);
/*
- * Reference count trick is needed because allocator gives us
- * referenced page but the pool requires pages with 0 refcount.
+ * Ref count on new page is already zero as it was dropped
+ * earlier. It can be directly added to the pool free list.
*/
__prep_account_new_huge_page(h, nid);
- page_ref_dec(new_page);
enqueue_huge_page(h, new_page);
/*
@@ -2604,6 +2695,8 @@ retry:
free_new:
spin_unlock_irq(&hugetlb_lock);
+ /* Page has a zero ref count, but needs a ref to be freed */
+ set_page_refcounted(new_page);
update_and_free_page(h, new_page, false);
return ret;
@@ -2828,8 +2921,8 @@ static void __init gather_bootmem_prealloc(void)
prep_new_huge_page(h, page, page_to_nid(page));
put_page(page); /* add to the hugepage allocator */
} else {
+ /* VERY unlikely inflated ref count on a tail page */
free_gigantic_page(page, huge_page_order(h));
- pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
}
/*
@@ -4033,8 +4126,10 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
- if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
+ }
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 1ae1ebc2b9b1..aff4d27ec235 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -30,7 +30,7 @@ static int hwpoison_inject(void *data, u64 val)
if (!hwpoison_filter_enable)
goto inject;
- shake_page(hpage, 0);
+ shake_page(hpage);
/*
* This implies unable to support non-LRU pages.
*/
diff --git a/mm/internal.h b/mm/internal.h
index 31ff935b2547..cf3cb933eba3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -211,6 +211,10 @@ extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
+extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr,
+ int nid, bool exact_nid);
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -539,12 +543,17 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#ifdef CONFIG_NUMA
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
unsigned int order)
{
return NODE_RECLAIM_NOSCAN;
}
+static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+ return NUMA_NO_NODE;
+}
#endif
extern int hwpoison_filter(struct page *p);
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index e4c16f6b6680..05d1e9460e2e 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -37,16 +37,9 @@ enum kasan_arg_stacktrace {
KASAN_ARG_STACKTRACE_ON,
};
-enum kasan_arg_fault {
- KASAN_ARG_FAULT_DEFAULT,
- KASAN_ARG_FAULT_REPORT,
- KASAN_ARG_FAULT_PANIC,
-};
-
static enum kasan_arg kasan_arg __ro_after_init;
static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
-static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
/* Whether KASAN is enabled at all. */
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
@@ -59,9 +52,6 @@ EXPORT_SYMBOL_GPL(kasan_flag_async);
/* Whether to collect alloc/free stack traces. */
DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
-/* Whether to panic or print a report and disable tag checking on fault. */
-bool kasan_flag_panic __ro_after_init;
-
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
{
@@ -113,23 +103,6 @@ static int __init early_kasan_flag_stacktrace(char *arg)
}
early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
-/* kasan.fault=report/panic */
-static int __init early_kasan_fault(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "report"))
- kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
- else if (!strcmp(arg, "panic"))
- kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
- else
- return -EINVAL;
-
- return 0;
-}
-early_param("kasan.fault", early_kasan_fault);
-
/* kasan_init_hw_tags_cpu() is called for each CPU. */
void kasan_init_hw_tags_cpu(void)
{
@@ -195,22 +168,6 @@ void __init kasan_init_hw_tags(void)
break;
}
- switch (kasan_arg_fault) {
- case KASAN_ARG_FAULT_DEFAULT:
- /*
- * Default to no panic on report.
- * Do nothing, kasan_flag_panic keeps its default value.
- */
- break;
- case KASAN_ARG_FAULT_REPORT:
- /* Do nothing, kasan_flag_panic keeps its default value. */
- break;
- case KASAN_ARG_FAULT_PANIC:
- /* Enable panic on report. */
- kasan_flag_panic = true;
- break;
- }
-
pr_info("KernelAddressSanitizer initialized\n");
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index fff93b0bcb08..8bf568a80eb8 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -38,7 +38,6 @@ static inline bool kasan_async_mode_enabled(void)
#endif
-extern bool kasan_flag_panic __ro_after_init;
extern bool kasan_flag_async __ro_after_init;
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 8fff1825b22c..884a950c7026 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -39,6 +39,31 @@ static unsigned long kasan_flags;
#define KASAN_BIT_REPORTED 0
#define KASAN_BIT_MULTI_SHOT 1
+enum kasan_arg_fault {
+ KASAN_ARG_FAULT_DEFAULT,
+ KASAN_ARG_FAULT_REPORT,
+ KASAN_ARG_FAULT_PANIC,
+};
+
+static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT;
+
+/* kasan.fault=report/panic */
+static int __init early_kasan_fault(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "report"))
+ kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
+ else if (!strcmp(arg, "panic"))
+ kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.fault", early_kasan_fault);
+
bool kasan_save_enable_multi_shot(void)
{
return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
@@ -102,10 +127,8 @@ static void end_report(unsigned long *flags, unsigned long addr)
panic_on_warn = 0;
panic("panic_on_warn set ...\n");
}
-#ifdef CONFIG_KASAN_HW_TAGS
- if (kasan_flag_panic)
+ if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
panic("kasan.fault=panic set ...\n");
-#endif
kasan_enable_current();
}
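
Usage note: because the kasan.fault handling now lives in mm/kasan/report.c rather than behind CONFIG_KASAN_HW_TAGS in hw_tags.c, booting with kasan.fault=panic should make any KASAN mode (generic, SW_TAGS or HW_TAGS) panic on the first report instead of only printing it, while kasan.fault=report keeps the default report-only behaviour.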
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b0412be08fa2..045cc579f724 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1721,7 +1721,7 @@ static void collapse_file(struct mm_struct *mm,
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
if (shmem_getpage(mapping->host, index, &page,
- SGP_NOHUGE)) {
+ SGP_NOALLOC)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 3fa9bc8a67cf..025338128cd9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -259,7 +259,7 @@ static unsigned long ksm_stable_node_chains;
static unsigned long ksm_stable_node_dups;
/* Delay in pruning stale stable_node_dups in the stable_node_chains */
-static int ksm_stable_node_chains_prune_millisecs = 2000;
+static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;
@@ -3105,11 +3105,11 @@ stable_node_chains_prune_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned long msecs;
+ unsigned int msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
+ err = kstrtouint(buf, 10, &msecs);
+ if (err)
return -EINVAL;
ksm_stable_node_chains_prune_millisecs = msecs;
diff --git a/mm/madvise.c b/mm/madvise.c
index 56324a3dbc4e..0734db8d53a7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1048,6 +1048,7 @@ process_madvise_behavior_valid(int behavior)
switch (behavior) {
case MADV_COLD:
case MADV_PAGEOUT:
+ case MADV_WILLNEED:
return true;
default:
return false;
diff --git a/mm/memblock.c b/mm/memblock.c
index e2ca8ddc8ebe..0ab5a749bfa6 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -315,7 +315,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
* Return:
* Found address on success, 0 on failure.
*/
-phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
+static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
@@ -1496,18 +1496,12 @@ void * __init memblock_alloc_exact_nid_raw(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
- void *ptr;
-
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
- ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid, true);
- if (ptr && size > 0)
- page_init_poison(ptr, size);
-
- return ptr;
+ return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
+ true);
}
/**
@@ -1534,18 +1528,12 @@ void * __init memblock_alloc_try_nid_raw(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
- void *ptr;
-
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
- ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid, false);
- if (ptr && size > 0)
- page_init_poison(ptr, size);
-
- return ptr;
+ return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
+ false);
}
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 389b5766e74f..b762215d73eb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -103,6 +103,14 @@ static bool do_memsw_account(void)
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
+/* memcg and lruvec stats flushing */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static void flush_memcg_stats_work(struct work_struct *w);
+static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work);
+static DEFINE_PER_CPU(unsigned int, stats_flush_threshold);
+static DEFINE_SPINLOCK(stats_flush_lock);
+
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
@@ -248,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
return &memcg->vmpressure;
}
-struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
- return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+ return container_of(vmpr, struct mem_cgroup, vmpressure);
}
#ifdef CONFIG_MEMCG_KMEM
@@ -646,17 +654,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
-static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
- long x = READ_ONCE(memcg->vmstats.state[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
long x = 0;
@@ -671,23 +668,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
return x;
}
-static struct mem_cgroup_per_node *
-parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
-{
- struct mem_cgroup *parent;
-
- parent = parent_mem_cgroup(pn->memcg);
- if (!parent)
- return NULL;
- return parent->nodeinfo[nid];
-}
-
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
- long x, threshold = MEMCG_CHARGE_BATCH;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
@@ -696,21 +681,9 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__mod_memcg_state(memcg, idx, val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
- if (vmstat_item_in_bytes(idx))
- threshold <<= PAGE_SHIFT;
-
- x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
- if (unlikely(abs(x) > threshold)) {
- pg_data_t *pgdat = lruvec_pgdat(lruvec);
- struct mem_cgroup_per_node *pi;
-
- for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
- atomic_long_add(x, &pi->lruvec_stat[idx]);
- x = 0;
- }
- __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+ if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH))
+ queue_work(system_unbound_wq, &stats_flush_work);
}
/**
@@ -905,7 +878,7 @@ EXPORT_SYMBOL(mem_cgroup_from_task);
static __always_inline struct mem_cgroup *active_memcg(void)
{
- if (in_interrupt())
+ if (!in_task())
return this_cpu_read(int_active_memcg);
else
return current->active_memcg;
@@ -2205,8 +2178,9 @@ static void drain_local_stock(struct work_struct *dummy)
unsigned long flags;
/*
- * The only protection from memory hotplug vs. drain_stock races is
- * that we always operate on local CPU stock here with IRQ disabled
+ * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
+ * drain_stock races is that we always operate on local CPU stock
+ * here with IRQ disabled
*/
local_irq_save(flags);
@@ -2273,7 +2247,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
- if (obj_stock_flush_required(stock, root_memcg))
+ else if (obj_stock_flush_required(stock, root_memcg))
flush = true;
rcu_read_unlock();
@@ -2289,40 +2263,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
-{
- int nid;
-
- for_each_node(nid) {
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
- unsigned long stat[NR_VM_NODE_STAT_ITEMS];
- struct batched_lruvec_stat *lstatc;
- int i;
-
- lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
- stat[i] = lstatc->count[i];
- lstatc->count[i] = 0;
- }
-
- do {
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- atomic_long_add(stat[i], &pn->lruvec_stat[i]);
- } while ((pn = parent_nodeinfo(pn, nid)));
- }
-}
-
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
- for_each_mem_cgroup(memcg)
- memcg_flush_lruvec_page_state(memcg, cpu);
-
return 0;
}
@@ -4116,7 +4063,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- if (val > 100)
+ if (val > 200)
return -EINVAL;
if (!mem_cgroup_is_root(memcg))
@@ -4668,7 +4615,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
atomic_read(&frn->done.cnt) == 1) {
frn->at = 0;
trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
- cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
WB_REASON_FOREIGN_FLUSH,
&frn->done);
}
@@ -4892,9 +4839,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
vfs_poll(efile.file, &event->pt);
- spin_lock(&memcg->event_list_lock);
+ spin_lock_irq(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
- spin_unlock(&memcg->event_list_lock);
+ spin_unlock_irq(&memcg->event_list_lock);
fdput(cfile);
fdput(efile);
@@ -5129,17 +5076,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_local) {
- kfree(pn);
- return 1;
- }
-
- pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_cpu) {
- free_percpu(pn->lruvec_stat_local);
+ pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!pn->lruvec_stats_percpu) {
kfree(pn);
return 1;
}
@@ -5160,8 +5099,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return;
- free_percpu(pn->lruvec_stat_cpu);
- free_percpu(pn->lruvec_stat_local);
+ free_percpu(pn->lruvec_stats_percpu);
kfree(pn);
}
@@ -5177,15 +5115,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
- int cpu;
-
memcg_wb_domain_exit(memcg);
- /*
- * Flush percpu lruvec stats to guarantee the value
- * correctness on parent's and all ancestor levels.
- */
- for_each_online_cpu(cpu)
- memcg_flush_lruvec_page_state(memcg, cpu);
__mem_cgroup_free(memcg);
}
@@ -5321,6 +5251,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
/* Online state pins memcg ID, memcg ID pins CSS */
refcount_set(&memcg->id.ref, 1);
css_get(css);
+
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+ 2UL*HZ);
return 0;
}
@@ -5334,12 +5268,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
- spin_lock(&memcg->event_list_lock);
+ spin_lock_irq(&memcg->event_list_lock);
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
- spin_unlock(&memcg->event_list_lock);
+ spin_unlock_irq(&memcg->event_list_lock);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
@@ -5412,13 +5346,33 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg_wb_domain_size_changed(memcg);
}
+void mem_cgroup_flush_stats(void)
+{
+ if (!spin_trylock(&stats_flush_lock))
+ return;
+
+ cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+ spin_unlock(&stats_flush_lock);
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+ mem_cgroup_flush_stats();
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+}
+
+static void flush_memcg_stats_work(struct work_struct *w)
+{
+ mem_cgroup_flush_stats();
+}
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct memcg_vmstats_percpu *statc;
long delta, v;
- int i;
+ int i, nid;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
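
Design note on the flush path added above: mem_cgroup_flush_stats() takes stats_flush_lock with spin_trylock() rather than spin_lock(), so concurrent flushers do not queue behind one another; whoever loses the race simply relies on the in-progress flush. Staleness is bounded from two directions: the deferrable worker re-queues itself every 2*HZ, and __mod_memcg_lruvec_state() queues an immediate flush on every MEMCG_CHARGE_BATCH-th per-cpu update of stats_flush_threshold.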
@@ -5466,6 +5420,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent)
parent->vmstats.events_pending[i] += delta;
}
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node *ppn = NULL;
+ struct lruvec_stats_percpu *lstatc;
+
+ if (parent)
+ ppn = parent->nodeinfo[nid];
+
+ lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ delta = pn->lruvec_stats.state_pending[i];
+ if (delta)
+ pn->lruvec_stats.state_pending[i] = 0;
+
+ v = READ_ONCE(lstatc->state[i]);
+ if (v != lstatc->state_prev[i]) {
+ delta += v - lstatc->state_prev[i];
+ lstatc->state_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ pn->lruvec_stats.state[i] += delta;
+ if (ppn)
+ ppn->lruvec_stats.state_pending[i] += delta;
+ }
+ }
}
#ifdef CONFIG_MMU
@@ -6399,6 +6383,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;
@@ -6704,8 +6690,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_low_usage)));
}
-static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
- gfp_t gfp)
+static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
{
unsigned int nr_pages = thp_nr_pages(page);
int ret;
@@ -6726,7 +6711,7 @@ out:
}
/**
- * mem_cgroup_charge - charge a newly allocated page to a cgroup
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
* @mm: mm context of the victim
* @gfp_mask: reclaim mode
@@ -6739,16 +6724,14 @@ out:
*
* Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
struct mem_cgroup *memcg;
int ret;
- if (mem_cgroup_disabled())
- return 0;
-
memcg = get_mem_cgroup_from_mm(mm);
- ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+ ret = charge_memcg(page, memcg, gfp_mask);
css_put(&memcg->css);
return ret;
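
The mem_cgroup_disabled() early return removed here does not disappear: with the function renamed to __mem_cgroup_charge(), the expectation is that the old name becomes a static inline wrapper in include/linux/memcontrol.h that performs the check before calling in, so disabled-memcg configurations skip the call entirely. A hedged sketch of that wrapper (the real one is in the header, not in this hunk):

	static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
					    gfp_t gfp_mask)
	{
		if (mem_cgroup_disabled())
			return 0;
		return __mem_cgroup_charge(page, mm, gfp_mask);
	}

The uncharge, swap-charge and swap-uncharge renames later in this file follow the same pattern.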
@@ -6783,7 +6766,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
memcg = get_mem_cgroup_from_mm(mm);
rcu_read_unlock();
- ret = __mem_cgroup_charge(page, memcg, gfp);
+ ret = charge_memcg(page, memcg, gfp);
css_put(&memcg->css);
return ret;
@@ -6919,18 +6902,15 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
}
/**
- * mem_cgroup_uncharge - uncharge a page
+ * __mem_cgroup_uncharge - uncharge a page
* @page: page to uncharge
*
- * Uncharge a page previously charged with mem_cgroup_charge().
+ * Uncharge a page previously charged with __mem_cgroup_charge().
*/
-void mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct page *page)
{
struct uncharge_gather ug;
- if (mem_cgroup_disabled())
- return;
-
/* Don't touch page->lru of any random page, pre-check: */
if (!page_memcg(page))
return;
@@ -6941,20 +6921,17 @@ void mem_cgroup_uncharge(struct page *page)
}
/**
- * mem_cgroup_uncharge_list - uncharge a list of page
+ * __mem_cgroup_uncharge_list - uncharge a list of page
* @page_list: list of pages to uncharge
*
* Uncharge a list of pages previously charged with
- * mem_cgroup_charge().
+ * __mem_cgroup_charge().
*/
-void mem_cgroup_uncharge_list(struct list_head *page_list)
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
{
struct uncharge_gather ug;
struct page *page;
- if (mem_cgroup_disabled())
- return;
-
uncharge_gather_clear(&ug);
list_for_each_entry(page, page_list, lru)
uncharge_page(page, &ug);
@@ -7244,7 +7221,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
}
/**
- * mem_cgroup_try_charge_swap - try charging swap space for a page
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
* @page: page being added to swap
* @entry: swap entry to charge
*
@@ -7252,16 +7229,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
*
* Returns 0 on success, -ENOMEM on failure.
*/
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
unsigned int nr_pages = thp_nr_pages(page);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
- if (mem_cgroup_disabled())
- return 0;
-
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
@@ -7297,11 +7271,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
}
/**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
* @entry: swap entry to uncharge
* @nr_pages: the amount of swap space to uncharge
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
unsigned short id;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e1f87cf13235..54879c339024 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -68,7 +68,7 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool __page_handle_poison(struct page *page)
{
- bool ret;
+ int ret;
zone_pcp_disable(page_zone(page));
ret = dissolve_free_huge_page(page);
@@ -76,7 +76,7 @@ static bool __page_handle_poison(struct page *page)
ret = take_page_off_buddy(page);
zone_pcp_enable(page_zone(page));
- return ret;
+ return ret > 0;
}
static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
@@ -282,9 +282,9 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
/*
* Unknown page type encountered. Try to check whether it can turn PageLRU by
- * lru_add_drain_all, or a free page by reclaiming slabs when possible.
+ * lru_add_drain_all.
*/
-void shake_page(struct page *p, int access)
+void shake_page(struct page *p)
{
if (PageHuge(p))
return;
@@ -296,11 +296,9 @@ void shake_page(struct page *p, int access)
}
/*
- * Only call shrink_node_slabs here (which would also shrink
- * other caches) if access is not potentially fatal.
+ * TODO: Could shrink slab caches here once a lightweight
+ * range-based shrinker becomes available.
*/
- if (access)
- drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);
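
The prototype change implied here is not visible in this hunk; the declaration elsewhere (include/linux/mm.h at this point) presumably loses its second parameter to match, roughly:

	extern void shake_page(struct page *p);

and, as the hwpoison-inject.c hunk and the call sites below show, every caller simply drops the old access argument now that the slab-shrinking path it guarded is gone.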
@@ -391,8 +389,8 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
/*
* Kill the processes that have been collected earlier.
*
- * Only do anything when DOIT is set, otherwise just free the list
- * (this is used for clean pages which do not need killing)
+ * Only do anything when FORCEKILL is set, otherwise just free the
+ * list (this is used for clean pages which do not need killing)
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
@@ -632,7 +630,7 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
{
struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
int ret = 0;
- pte_t *ptep;
+ pte_t *ptep, *mapped_pte;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmdp, walk->vma);
@@ -645,14 +643,15 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
if (pmd_trans_unstable(pmdp))
goto out;
- ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl);
+ mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
+ addr, &ptl);
for (; addr != end; ptep++, addr += PAGE_SIZE) {
ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
hwp->pfn, &hwp->tk);
if (ret == 1)
break;
}
- pte_unmap_unlock(ptep - 1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
out:
cond_resched();
return ret;
@@ -1204,7 +1203,7 @@ try_again:
* page, retry.
*/
if (pass++ < 3) {
- shake_page(p, 1);
+ shake_page(p);
goto try_again;
}
ret = -EIO;
@@ -1221,7 +1220,7 @@ try_again:
*/
if (pass++ < 3) {
put_page(p);
- shake_page(p, 1);
+ shake_page(p);
count_increased = false;
goto try_again;
}
@@ -1229,6 +1228,9 @@ try_again:
ret = -EIO;
}
out:
+ if (ret == -EIO)
+ dump_page(p, "hwpoison: unhandlable page");
+
return ret;
}
@@ -1270,14 +1272,13 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int flags, struct page **hpagep)
+ int flags, struct page *hpage)
{
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int kill = 1, forcekill;
- struct page *hpage = *hpagep;
bool mlocked = PageMlocked(hpage);
/*
@@ -1369,7 +1370,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* shake_page() again to ensure that it's flushed.
*/
if (mlocked)
- shake_page(hpage, 0);
+ shake_page(hpage);
/*
* Now that the dirty bit has been propagated to the
@@ -1502,7 +1503,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}
- if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
+ if (!hwpoison_user_mappings(p, pfn, flags, head)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
@@ -1518,7 +1519,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
struct dev_pagemap *pgmap)
{
struct page *page = pfn_to_page(pfn);
- const bool unmap_success = true;
unsigned long size = 0;
struct to_kill *tk;
LIST_HEAD(tokill);
@@ -1590,7 +1590,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
start = (page->index << PAGE_SHIFT) & ~(size - 1);
unmap_mapping_range(page->mapping, start, size, 0);
}
- kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+ kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags);
rc = 0;
unlock:
dax_unlock_page(page, cookie);
@@ -1724,7 +1724,7 @@ try_again:
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- shake_page(p, 0);
+ shake_page(p);
lock_page(p);
@@ -1783,7 +1783,7 @@ try_again:
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
*/
- if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
+ if (!hwpoison_user_mappings(p, pfn, flags, p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto unlock_page;
@@ -2099,7 +2099,7 @@ static int __soft_offline_page(struct page *page)
if (isolate_page(hpage, &pagelist)) {
ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
if (!ret) {
bool release = !huge;
@@ -2208,9 +2208,6 @@ retry:
try_again = false;
goto retry;
}
- } else if (ret == -EIO) {
- pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n",
- __func__, pfn, page->flags, &page->flags);
}
return ret;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 86c3af79e874..4c527a80b6c9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1469,7 +1469,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (nodes_empty(nmask))
node_set(mtc.nid, nmask);
ret = migrate_pages(&source, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
if (ret) {
list_for_each_entry(page, &source, lru) {
if (__ratelimit(&migrate_rs)) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e32360e90274..5e90b3fb7794 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -31,6 +31,9 @@
* but useful to set in a VMA when you have a non default
* process policy.
*
+ * preferred many Try a set of nodes first before normal fallback. This is
+ * similar to preferred without the special case.
+ *
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@@ -189,7 +192,7 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
nodes_onto(*ret, tmp, *rel);
}
-static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
@@ -207,14 +210,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
return 0;
}
-static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
-{
- if (nodes_empty(*nodes))
- return -EINVAL;
- pol->nodes = *nodes;
- return 0;
-}
-
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
@@ -394,7 +389,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
- .create = mpol_new_interleave,
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
@@ -402,12 +397,16 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
- .create = mpol_new_bind,
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_LOCAL] = {
.rebind = mpol_rebind_default,
},
+ [MPOL_PREFERRED_MANY] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_preferred,
+ },
};
static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -900,6 +899,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
*nodes = p->nodes;
break;
case MPOL_LOCAL:
@@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(&pagelist);
}
@@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_page, NULL,
- start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
if (nr_failed)
putback_movable_pages(&pagelist);
}
@@ -1446,7 +1446,8 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
*flags = *mode & MPOL_MODE_FLAGS;
*mode &= ~MPOL_MODE_FLAGS;
- if ((unsigned int)(*mode) >= MPOL_MAX)
+
+ if ((unsigned int)(*mode) >= MPOL_MAX)
return -EINVAL;
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
@@ -1875,16 +1876,27 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
*/
nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
+ int mode = policy->mode;
+
/* Lower zones don't get a nodemask applied for MPOL_BIND */
- if (unlikely(policy->mode == MPOL_BIND) &&
- apply_policy_zone(policy, gfp_zone(gfp)) &&
- cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+ if (unlikely(mode == MPOL_BIND) &&
+ apply_policy_zone(policy, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+ return &policy->nodes;
+
+ if (mode == MPOL_PREFERRED_MANY)
return &policy->nodes;
return NULL;
}
-/* Return the node id preferred by the given mempolicy, or the given id */
+/*
+ * Return the preferred node id for 'prefer' mempolicy, and return
+ * the given id for all other policies.
+ *
+ * policy_node() is always coupled with policy_nodemask(), which
+ * secures the nodemask limit for 'bind' and 'prefer-many' policy.
+ */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED) {
@@ -1922,7 +1934,7 @@ unsigned int mempolicy_slab_node(void)
struct mempolicy *policy;
int node = numa_mem_id();
- if (in_interrupt())
+ if (!in_task())
return node;
policy = current->mempolicy;
@@ -1936,7 +1948,9 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
- case MPOL_BIND: {
+ case MPOL_BIND:
+ case MPOL_PREFERRED_MANY:
+ {
struct zoneref *z;
/*
@@ -2008,12 +2022,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
*
* Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
- * If the effective policy is 'BIND, returns a pointer to the mempolicy's
- * @nodemask for filtering the zonelist.
+ * If the effective policy is 'bind' or 'prefer-many', returns a pointer
+ * to the mempolicy's @nodemask for filtering the zonelist.
*
* Must be protected by read_mems_allowed_begin()
*/
@@ -2021,16 +2035,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
int nid;
+ int mode;
*mpol = get_vma_policy(vma, addr);
- *nodemask = NULL; /* assume !MPOL_BIND */
+ *nodemask = NULL;
+ mode = (*mpol)->mode;
- if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+ if (unlikely(mode == MPOL_INTERLEAVE)) {
nid = interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma)));
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
- if ((*mpol)->mode == MPOL_BIND)
+ if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
*nodemask = &(*mpol)->nodes;
}
return nid;
@@ -2063,6 +2079,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
*mask = mempolicy->nodes;
@@ -2128,6 +2145,27 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
return page;
}
+static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
+ int nid, struct mempolicy *pol)
+{
+ struct page *page;
+ gfp_t preferred_gfp;
+
+ /*
+ * This is a two pass approach. The first pass will only try the
+ * preferred nodes but skip the direct reclaim and allow the
+ * allocation to fail, while the second pass will try all the
+ * nodes in system.
+ */
+ preferred_gfp = gfp | __GFP_NOWARN;
+ preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+ page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
+ if (!page)
+ page = __alloc_pages(gfp, order, numa_node_id(), NULL);
+
+ return page;
+}
+
/**
* alloc_pages_vma - Allocate a page for a VMA.
* @gfp: GFP flags.
@@ -2163,6 +2201,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
goto out;
}
+ if (pol->mode == MPOL_PREFERRED_MANY) {
+ page = alloc_pages_preferred_many(gfp, order, node, pol);
+ mpol_cond_put(pol);
+ goto out;
+ }
+
if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
int hpage_node = node;
@@ -2173,7 +2217,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
- * If the policy is interleave, or does not allow the current
+ * If the policy is interleave or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode == MPOL_PREFERRED)
@@ -2240,6 +2284,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
*/
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else if (pol->mode == MPOL_PREFERRED_MANY)
+ page = alloc_pages_preferred_many(gfp, order,
+ numa_node_id(), pol);
else
page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
@@ -2311,6 +2358,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
@@ -2425,8 +2473,8 @@ static void sp_free(struct sp_node *n)
* node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*
- * Return: -1 if the page is in a node that is valid for this policy, or a
- * suitable node ID to allocate a replacement page from.
+ * Return: NUMA_NO_NODE if the page is in a node that is valid for this
+ * policy, or a suitable node ID to allocate a replacement page from.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
@@ -2437,7 +2485,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
int polnid = NUMA_NO_NODE;
- int ret = -1;
+ int ret = NUMA_NO_NODE;
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
@@ -2451,6 +2499,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_PREFERRED:
+ if (node_isset(curnid, pol->nodes))
+ goto out;
polnid = first_node(pol->nodes);
break;
@@ -2465,9 +2515,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
goto out;
}
+ fallthrough;
+ case MPOL_PREFERRED_MANY:
/*
- * allows binding to multiple nodes.
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
@@ -2829,6 +2880,7 @@ static const char * const policy_modes[] =
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local",
+ [MPOL_PREFERRED_MANY] = "prefer (many)",
};
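
For completeness, a hedged userspace sketch of requesting the new policy via set_mempolicy(2); MPOL_PREFERRED_MANY and its numeric value are assumptions taken from the uapi header this series extends, and older libnuma headers may not define it yet:

	#include <numaif.h>		/* set_mempolicy(), link with -lnuma */
	#include <stdio.h>

	#ifndef MPOL_PREFERRED_MANY
	#define MPOL_PREFERRED_MANY 5	/* assumed uapi value */
	#endif

	int main(void)
	{
		unsigned long nodes = (1UL << 0) | (1UL << 1);	/* prefer nodes 0-1 */

		if (set_mempolicy(MPOL_PREFERRED_MANY, &nodes, 8 * sizeof(nodes)))
			perror("set_mempolicy");
		return 0;
	}

Allocations by this task then try nodes 0-1 first and fall back to the rest of the system, matching the two-pass behaviour of alloc_pages_preferred_many() above.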
@@ -2907,6 +2959,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
if (!nodelist)
err = 0;
goto out;
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
/*
* Insist on a nodelist
@@ -2993,6 +3046,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
nodes = pol->nodes;
@@ -3021,3 +3075,64 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ numa_demotion_enabled ? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+ numa_demotion_enabled = true;
+ else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+ numa_demotion_enabled = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+ __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+ numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+ &numa_demotion_enabled_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+ .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+ int err;
+ struct kobject *numa_kobj;
+
+ numa_kobj = kobject_create_and_add("numa", mm_kobj);
+ if (!numa_kobj) {
+ pr_err("failed to create numa kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(numa_kobj, &numa_attr_group);
+ if (err) {
+ pr_err("failed to register numa group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(numa_kobj);
+ return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif
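
Usage note (hedged): with the attribute group registered on mm_kobj, the new knob should appear as /sys/kernel/mm/numa/demotion_enabled and, per the store handler above, accepts "true"/"1" and "false"/"0". numa_demotion_enabled starts out false, so reclaim-based demotion remains opt-in.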
diff --git a/mm/migrate.c b/mm/migrate.c
index 7e240437e7d9..a0aeb3fe46a7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -49,6 +49,7 @@
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
+#include <linux/memory.h>
#include <asm/tlbflush.h>
@@ -1099,6 +1100,80 @@ out:
return rc;
}
+
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets. Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node. The
+ * CPUs are placed in the node with the "fast" memory. The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ * Socket A: 0, 1, 2
+ * Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1. When Node 1 fills up, it should be migrated to
+ * Node 2. The migration path starts on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progresses through medium and ends with the
+ * slow memory:
+ *
+ * 0 -> 1 -> 2 -> stop
+ * 3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ * { 1, // Node 0 migrates to 1
+ * 2, // Node 1 migrates to 2
+ * -1, // Node 2 does not migrate
+ * 4, // Node 3 migrates to 4
+ * 5, // Node 4 migrates to 5
+ * -1} // Node 5 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking. Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+static int node_demotion[MAX_NUMNODES] __read_mostly =
+ {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+ int target;
+
+ /*
+ * node_demotion[] is updated without excluding this
+ * function from running. RCU doesn't provide any
+ * compiler barriers, so the READ_ONCE() is required
+ * to avoid compiler reordering or read merging.
+ *
+ * Make sure to use RCU over entire code blocks if
+ * node_demotion[] reads need to be consistent.
+ */
+ rcu_read_lock();
+ target = READ_ONCE(node_demotion[node]);
+ rcu_read_unlock();
+
+ return target;
+}
+
/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
@@ -1354,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2,
* @mode: The migration mode that specifies the constraints for
* page migration, if any.
* @reason: The reason for page migration.
+ * @ret_succeeded: Set to the number of pages migrated successfully if
+ * the caller passes a non-NULL pointer.
*
* The function returns after 10 attempts or if no pages are movable any more
* because the list has become empty or no retryable pages exist any more.
@@ -1364,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
- enum migrate_mode mode, int reason)
+ enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
{
int retry = 1;
int thp_retry = 1;
@@ -1519,6 +1596,9 @@ out:
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
+ if (ret_succeeded)
+ *ret_succeeded = nr_succeeded;
+
return rc;
}
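
For a caller that actually consumes the new out-parameter (every call site updated in this patch just passes NULL), a hedged sketch in the style of the reclaim-based demotion path that motivates it; the helper and reason names here are illustrative:

	unsigned int nr_succeeded = 0;
	int err;

	err = migrate_pages(&demote_pages, alloc_demote_page, NULL,
			    (unsigned long)target_nid, MIGRATE_ASYNC,
			    MR_DEMOTION, &nr_succeeded);
	/* nr_succeeded pages were demoted even if err != 0 */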
@@ -1588,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm,
};
err = migrate_pages(pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(pagelist);
return err;
@@ -2103,7 +2183,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
list_add(&page->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
- MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+ MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
@@ -2982,3 +3062,232 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
EXPORT_SYMBOL(migrate_vma_finalize);
#endif /* CONFIG_DEVICE_PRIVATE */
+
+#if defined(CONFIG_MEMORY_HOTPLUG)
+/* Disable reclaim-based migration. */
+static void __disable_all_migrate_targets(void)
+{
+ int node;
+
+ for_each_online_node(node)
+ node_demotion[node] = NUMA_NO_NODE;
+}
+
+static void disable_all_migrate_targets(void)
+{
+ __disable_all_migrate_targets();
+
+ /*
+ * Ensure that the "disable" is visible across the system.
+ * Readers will see either a combination of before+disable
+ * state or disable+after. They will never see before and
+ * after state together.
+ *
+ * The before+after state together might have cycles and
+ * could cause readers to do things like loop until this
+ * function finishes. This ensures they can only see a
+ * single "bad" read and would, for instance, only loop
+ * once.
+ */
+ synchronize_rcu();
+}
+
+/*
+ * Find an automatic demotion target for 'node'.
+ * Failing here is OK. It might just indicate
+ * being at the end of a chain.
+ */
+static int establish_migrate_target(int node, nodemask_t *used)
+{
+ int migration_target;
+
+ /*
+ * Can not set a migration target on a
+ * node with it already set.
+ *
+ * No need for READ_ONCE() here since this
+ * is in the write path for node_demotion[].
+ * This should be the only thread writing.
+ */
+ if (node_demotion[node] != NUMA_NO_NODE)
+ return NUMA_NO_NODE;
+
+ migration_target = find_next_best_node(node, used);
+ if (migration_target == NUMA_NO_NODE)
+ return NUMA_NO_NODE;
+
+ node_demotion[node] = migration_target;
+
+ return migration_target;
+}
+
+/*
+ * When memory fills up on a node, memory contents can be
+ * automatically migrated to another node instead of
+ * discarded at reclaim.
+ *
+ * Establish a "migration path" which will start at nodes
+ * with CPUs and will follow the priorities used to build the
+ * page allocator zonelists.
+ *
+ * The difference here is that cycles must be avoided. If
+ * node0 migrates to node1, then neither node1, nor anything
+ * node1 migrates to can migrate to node0.
+ *
+ * This function can run simultaneously with readers of
+ * node_demotion[]. However, it can not run simultaneously
+ * with itself. Exclusion is provided by memory hotplug events
+ * being single-threaded.
+ */
+static void __set_migration_target_nodes(void)
+{
+ nodemask_t next_pass = NODE_MASK_NONE;
+ nodemask_t this_pass = NODE_MASK_NONE;
+ nodemask_t used_targets = NODE_MASK_NONE;
+ int node;
+
+ /*
+ * Avoid any oddities like cycles that could occur
+ * from changes in the topology. This will leave
+ * a momentary gap when migration is disabled.
+ */
+ disable_all_migrate_targets();
+
+ /*
+ * Allocations go close to CPUs, first. Assume that
+ * the migration path starts at the nodes with CPUs.
+ */
+ next_pass = node_states[N_CPU];
+again:
+ this_pass = next_pass;
+ next_pass = NODE_MASK_NONE;
+ /*
+ * To avoid cycles in the migration "graph", ensure
+ * that migration sources are not future targets by
+ * setting them in 'used_targets'. Do this only
+ * once per pass so that multiple source nodes can
+ * share a target node.
+ *
+ * 'used_targets' will become unavailable in future
+ * passes. This limits some opportunities for
+ * multiple source nodes to share a destination.
+ */
+ nodes_or(used_targets, used_targets, this_pass);
+ for_each_node_mask(node, this_pass) {
+ int target_node = establish_migrate_target(node, &used_targets);
+
+ if (target_node == NUMA_NO_NODE)
+ continue;
+
+ /*
+ * Visit targets from this pass in the next pass.
+ * Eventually, every node will have been part of
+ * a pass, and will become set in 'used_targets'.
+ */
+ node_set(target_node, next_pass);
+ }
+ /*
+ * 'next_pass' contains nodes which became migration
+ * targets in this pass. Make additional passes until
+ * no more migration targets are available.
+ */
+ if (!nodes_empty(next_pass))
+ goto again;
+}
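
Worked example of the passes above, using the two-socket topology from the node_demotion[] comment earlier (CPUs on nodes 0 and 3): pass 1 takes sources {0,3} and establishes targets {1,4}; pass 2 takes {1,4} and establishes {2,5}; pass 3 starts from {2,5}, finds no unused targets, and stops. Expressed as the resulting table (a sketch, with NUMA_NO_NODE standing in for the -1 used in the earlier comment):

	/* node:                       0  1  2             3  4  5            */
	static const int example[] = { 1, 2, NUMA_NO_NODE, 4, 5, NUMA_NO_NODE };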
+
+/*
+ * For callers that do not hold get_online_mems() already.
+ */
+static void set_migration_target_nodes(void)
+{
+ get_online_mems();
+ __set_migration_target_nodes();
+ put_online_mems();
+}
+
+/*
+ * React to hotplug events that might affect the migration targets
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering is also currently dependent on which nodes have
+ * CPUs. That means we need CPU on/offline notification too.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
+
+/*
+ * This leaves migrate-on-reclaim transiently disabled between
+ * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
+ * whether reclaim-based migration is enabled or not, which
+ * ensures that the user can turn reclaim-based migration on or
+ * off at any time without needing to recalculate migration targets.
+ *
+ * These callbacks already hold get_online_mems(). That is why
+ * __set_migration_target_nodes() can be used as opposed to
+ * set_migration_target_nodes().
+ */
+static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ /*
+ * Make sure there are not transient states where
+ * an offline node is a migration target. This
+ * will leave migration disabled until the offline
+ * completes and the MEM_OFFLINE case below runs.
+ */
+ disable_all_migrate_targets();
+ break;
+ case MEM_OFFLINE:
+ case MEM_ONLINE:
+ /*
+ * Recalculate the target nodes once the node
+ * reaches its final state (online or offline).
+ */
+ __set_migration_target_nodes();
+ break;
+ case MEM_CANCEL_OFFLINE:
+ /*
+ * MEM_GOING_OFFLINE disabled all the migration
+ * targets. Reenable them.
+ */
+ __set_migration_target_nodes();
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break;
+ }
+
+ return notifier_from_errno(0);
+}
+
+static int __init migrate_on_reclaim_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
+ migration_online_cpu,
+ migration_offline_cpu);
+ /*
+ * In the unlikely case that this fails, the automatic
+ * migration targets may become suboptimal for nodes
+ * where N_CPU changes. With such a small impact in a
+ * rare case, do not bother trying to do anything special.
+ */
+ WARN_ON(ret < 0);
+
+ hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+ return 0;
+}
+late_initcall(migrate_on_reclaim_init);
+#endif /* CONFIG_MEMORY_HOTPLUG */
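The mm/migrate.c additions above build the reclaim-demotion graph in breadth-first passes starting from the CPU-bearing nodes. Below is a small user-space sketch that models the same pass structure with plain bitmasks; the four-node topology, the has_cpu[] table and pick_target() are illustrative assumptions, not kernel interfaces. The point it demonstrates is that every pass marks its source nodes (and each chosen target) as used before the next pass runs, so a node can never end up demoting back toward one of its ancestors.

#include <stdio.h>

#define NR_NODES 4
#define NO_NODE  (-1)

/* Illustrative topology: nodes 0 and 1 have CPUs, 2 and 3 are memory-only. */
static const int has_cpu[NR_NODES] = { 1, 1, 0, 0 };
static int demotion_target[NR_NODES];

/* Stand-in for establish_migrate_target(): lowest-numbered unused node. */
static int pick_target(unsigned int *used)
{
	for (int n = 0; n < NR_NODES; n++) {
		if (!(*used & (1u << n))) {
			*used |= 1u << n;	/* find_next_best_node() marks its pick */
			return n;
		}
	}
	return NO_NODE;
}

int main(void)
{
	unsigned int this_pass, next_pass = 0, used = 0;

	for (int n = 0; n < NR_NODES; n++) {
		demotion_target[n] = NO_NODE;	/* disable_all_migrate_targets() */
		if (has_cpu[n])
			next_pass |= 1u << n;	/* migration paths start at CPU nodes */
	}

	while (next_pass) {
		this_pass = next_pass;
		next_pass = 0;
		used |= this_pass;	/* sources of this pass can never become targets */

		for (int n = 0; n < NR_NODES; n++) {
			int target;

			if (!(this_pass & (1u << n)))
				continue;
			target = pick_target(&used);
			if (target == NO_NODE)
				continue;
			demotion_target[n] = target;
			next_pass |= 1u << target;	/* visit new targets next pass */
		}
	}

	for (int n = 0; n < NR_NODES; n++)
		printf("node%d -> %d\n", n, demotion_target[n]);
	return 0;
}

With this toy topology the CPU nodes demote to the memory-only nodes and the chain terminates there, which is the acyclic shape the comment block at the top of the hunk describes.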
diff --git a/mm/mmap.c b/mm/mmap.c
index 181a113b545d..dce46105e3df 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -534,6 +534,7 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
{
struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+ mmap_assert_locked(mm);
__rb_link = &mm->mm_rb.rb_node;
rb_prev = __rb_parent = NULL;
@@ -2297,6 +2298,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
struct rb_node *rb_node;
struct vm_area_struct *vma;
+ mmap_assert_locked(mm);
/* Check the cache first. */
vma = vmacache_find(mm, addr);
if (likely(vma))
@@ -2986,14 +2988,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (mmap_write_lock_killable(mm))
return -EINTR;
- vma = find_vma(mm, start);
+ vma = vma_lookup(mm, start);
if (!vma || !(vma->vm_flags & VM_SHARED))
goto out;
- if (start < vma->vm_start)
- goto out;
-
if (start + size > vma->vm_end) {
struct vm_area_struct *next;
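The remap_file_pages() hunk above can drop the explicit "start < vma->vm_start" check because vma_lookup() only returns a VMA that actually contains the address, whereas find_vma() returns the first VMA ending above it, which may sit entirely above the lookup address. The toy model below is only meant to show that semantic difference; the array walk stands in for the kernel's tree lookup and is not how either helper is implemented.

#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for struct vm_area_struct: covers [vm_start, vm_end) */
struct vma {
	unsigned long vm_start;
	unsigned long vm_end;
};

/* Sorted toy "address space" with a hole between the two areas. */
static struct vma vmas[] = {
	{ 0x1000, 0x3000 },
	{ 0x8000, 0x9000 },
};

/* find_vma() semantics: first VMA with vm_end > addr, possibly above addr. */
static struct vma *toy_find_vma(unsigned long addr)
{
	for (size_t i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
		if (vmas[i].vm_end > addr)
			return &vmas[i];
	return NULL;
}

/* vma_lookup() semantics: only return a VMA that actually contains addr. */
static struct vma *toy_vma_lookup(unsigned long addr)
{
	struct vma *v = toy_find_vma(addr);

	return (v && addr >= v->vm_start) ? v : NULL;
}

int main(void)
{
	unsigned long addr = 0x5000;	/* lies in the hole between the VMAs */

	printf("find_vma:   %p\n", (void *)toy_find_vma(addr));   /* second VMA */
	printf("vma_lookup: %p\n", (void *)toy_vma_lookup(addr)); /* NULL */
	return 0;
}

For an address inside the hole, a find_vma()-style lookup hands back the next VMA and the caller must re-check vm_start itself; vma_lookup() folds that check in, which is exactly the check the hunk removes.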
diff --git a/mm/mremap.c b/mm/mremap.c
index 5989d3990020..badfe17ade1f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -686,7 +686,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
- vm_acct_memory(new_len >> PAGE_SHIFT);
+ vm_acct_memory(old_len >> PAGE_SHIFT);
excess = 0;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c729a4c4a1ac..831340e7ad8b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -28,6 +28,7 @@
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/swap.h>
+#include <linux/syscalls.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
@@ -1141,3 +1142,72 @@ void pagefault_out_of_memory(void)
out_of_memory(&oc);
mutex_unlock(&oom_lock);
}
+
+SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_MMU
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ struct task_struct *p;
+ unsigned int f_flags;
+ bool reap = true;
+ struct pid *pid;
+ long ret = 0;
+
+ if (flags)
+ return -EINVAL;
+
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ if (!task) {
+ ret = -ESRCH;
+ goto put_pid;
+ }
+
+ /*
+ * Make sure to choose a thread which still has a reference to mm
+ * during the group exit
+ */
+ p = find_lock_task_mm(task);
+ if (!p) {
+ ret = -ESRCH;
+ goto put_task;
+ }
+
+ mm = p->mm;
+ mmgrab(mm);
+
+ /* If the work has been done already, just exit with success */
+ if (test_bit(MMF_OOM_SKIP, &mm->flags))
+ reap = false;
+ else if (!task_will_free_mem(p)) {
+ reap = false;
+ ret = -EINVAL;
+ }
+ task_unlock(p);
+
+ if (!reap)
+ goto drop_mm;
+
+ if (mmap_read_lock_killable(mm)) {
+ ret = -EINTR;
+ goto drop_mm;
+ }
+ if (!__oom_reap_task_mm(mm))
+ ret = -EAGAIN;
+ mmap_read_unlock(mm);
+
+drop_mm:
+ mmdrop(mm);
+put_task:
+ put_task_struct(task);
+put_pid:
+ put_pid(pid);
+ return ret;
+#else
+ return -ENOSYS;
+#endif /* CONFIG_MMU */
+}
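For reference, here is a rough user-space sketch of how a lifecycle manager might drive the new process_mrelease() syscall: kill the target, then reap its address space without waiting for the victim to run its own exit path. Treat the syscall numbers as assumptions (448 for process_mrelease and 434 for pidfd_open are what this series and the asm-generic table use); the -EINVAL return corresponds to the task_will_free_mem() check above, i.e. the target was not actually exiting.

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif
#ifndef __NR_process_mrelease
#define __NR_process_mrelease 448
#endif

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	pid_t pid = (pid_t)atoi(argv[1]);
	int pidfd = (int)syscall(__NR_pidfd_open, pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* process_mrelease() only reaps tasks that are already exiting,
	 * so make the target exit first (we must be allowed to signal it). */
	kill(pid, SIGKILL);

	/* flags must be 0; -EINVAL means the task was not exiting (or bad flags) */
	if (syscall(__NR_process_mrelease, pidfd, 0) < 0)
		perror("process_mrelease");

	close(pidfd);
	return 0;
}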
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c12f67cbfa19..4812a17b288c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -183,7 +183,7 @@ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
static void wb_min_max_ratio(struct bdi_writeback *wb,
unsigned long *minp, unsigned long *maxp)
{
- unsigned long this_bw = wb->avg_write_bandwidth;
+ unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
unsigned long long min = wb->bdi->min_ratio;
unsigned long long max = wb->bdi->max_ratio;
@@ -892,7 +892,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
struct bdi_writeback *wb = dtc->wb;
- unsigned long write_bw = wb->avg_write_bandwidth;
+ unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
@@ -1115,7 +1115,7 @@ out:
&wb->bdi->tot_write_bandwidth) <= 0);
}
wb->write_bandwidth = bw;
- wb->avg_write_bandwidth = avg;
+ WRITE_ONCE(wb->avg_write_bandwidth, avg);
}
static void update_dirty_limit(struct dirty_throttle_control *dtc)
@@ -1147,8 +1147,8 @@ update:
dom->dirty_limit = limit;
}
-static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
- unsigned long now)
+static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
+ unsigned long now)
{
struct wb_domain *dom = dtc_dom(dtc);
@@ -1324,7 +1324,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
else
dirty_ratelimit -= step;
- wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+ WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
@@ -1332,35 +1332,28 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
struct dirty_throttle_control *mdtc,
- unsigned long start_time,
bool update_ratelimit)
{
struct bdi_writeback *wb = gdtc->wb;
unsigned long now = jiffies;
- unsigned long elapsed = now - wb->bw_time_stamp;
+ unsigned long elapsed;
unsigned long dirtied;
unsigned long written;
- lockdep_assert_held(&wb->list_lock);
+ spin_lock(&wb->list_lock);
/*
- * rate-limit, only update once every 200ms.
+ * Lockless checks for elapsed time are racy and delayed update after
+ * IO completion doesn't do it at all (to make sure written pages are
+ * accounted reasonably quickly). Make sure elapsed >= 1 to avoid
+ * division errors.
*/
- if (elapsed < BANDWIDTH_INTERVAL)
- return;
-
+ elapsed = max(now - wb->bw_time_stamp, 1UL);
dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
- /*
- * Skip quiet periods when disk bandwidth is under-utilized.
- * (at least 1s idle time between two flusher runs)
- */
- if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
- goto snapshot;
-
if (update_ratelimit) {
- domain_update_bandwidth(gdtc, now);
+ domain_update_dirty_limit(gdtc, now);
wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
/*
@@ -1368,23 +1361,41 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
* compiler has no way to figure that out. Help it.
*/
if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
- domain_update_bandwidth(mdtc, now);
+ domain_update_dirty_limit(mdtc, now);
wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
}
}
wb_update_write_bandwidth(wb, elapsed, written);
-snapshot:
wb->dirtied_stamp = dirtied;
wb->written_stamp = written;
- wb->bw_time_stamp = now;
+ WRITE_ONCE(wb->bw_time_stamp, now);
+ spin_unlock(&wb->list_lock);
}
-void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
- __wb_update_bandwidth(&gdtc, NULL, start_time, false);
+ __wb_update_bandwidth(&gdtc, NULL, false);
+}
+
+/* Interval after which we consider wb idle and don't estimate bandwidth */
+#define WB_BANDWIDTH_IDLE_JIF (HZ)
+
+static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
+{
+ unsigned long now = jiffies;
+ unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp);
+
+ if (elapsed > WB_BANDWIDTH_IDLE_JIF &&
+ !atomic_read(&wb->writeback_inodes)) {
+ spin_lock(&wb->list_lock);
+ wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
+ wb->written_stamp = wb_stat(wb, WB_WRITTEN);
+ WRITE_ONCE(wb->bw_time_stamp, now);
+ spin_unlock(&wb->list_lock);
+ }
}
/*
@@ -1407,7 +1418,7 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
static unsigned long wb_max_pause(struct bdi_writeback *wb,
unsigned long wb_dirty)
{
- unsigned long bw = wb->avg_write_bandwidth;
+ unsigned long bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long t;
/*
@@ -1429,8 +1440,8 @@ static long wb_min_pause(struct bdi_writeback *wb,
unsigned long dirty_ratelimit,
int *nr_dirtied_pause)
{
- long hi = ilog2(wb->avg_write_bandwidth);
- long lo = ilog2(wb->dirty_ratelimit);
+ long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth));
+ long lo = ilog2(READ_ONCE(wb->dirty_ratelimit));
long t; /* target pause */
long pause; /* estimated next pause */
int pages; /* target nr_dirtied_pause */
@@ -1710,15 +1721,12 @@ free_running:
if (dirty_exceeded && !wb->dirty_exceeded)
wb->dirty_exceeded = 1;
- if (time_is_before_jiffies(wb->bw_time_stamp +
- BANDWIDTH_INTERVAL)) {
- spin_lock(&wb->list_lock);
- __wb_update_bandwidth(gdtc, mdtc, start_time, true);
- spin_unlock(&wb->list_lock);
- }
+ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+ BANDWIDTH_INTERVAL))
+ __wb_update_bandwidth(gdtc, mdtc, true);
/* throttle according to the chosen dtc */
- dirty_ratelimit = wb->dirty_ratelimit;
+ dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
RATELIMIT_CALC_SHIFT;
max_pause = wb_max_pause(wb, sdtc->wb_dirty);
@@ -2345,9 +2353,12 @@ EXPORT_SYMBOL(generic_writepages);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
+ struct bdi_writeback *wb;
if (wbc->nr_to_write <= 0)
return 0;
+ wb = inode_to_wb_wbc(mapping->host, wbc);
+ wb_bandwidth_estimate_start(wb);
while (1) {
if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
@@ -2358,6 +2369,14 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
cond_resched();
congestion_wait(BLK_RW_ASYNC, HZ/50);
}
+ /*
+ * Usually few of the pages we just submitted are written back by now,
+ * but if writeback is being submitted constantly, this makes sure the
+ * writeback bandwidth is updated once in a while.
+ */
+ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+ BANDWIDTH_INTERVAL))
+ wb_update_bandwidth(wb);
return ret;
}
@@ -2729,6 +2748,24 @@ int clear_page_dirty_for_io(struct page *page)
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
+static void wb_inode_writeback_start(struct bdi_writeback *wb)
+{
+ atomic_inc(&wb->writeback_inodes);
+}
+
+static void wb_inode_writeback_end(struct bdi_writeback *wb)
+{
+ atomic_dec(&wb->writeback_inodes);
+ /*
+ * Make sure estimate of writeback throughput gets updated after
+ * writeback completed. We delay the update by BANDWIDTH_INTERVAL
+ * (which is the interval other bandwidth updates use for batching) so
+ * that if multiple inodes end writeback at a similar time, they get
+ * batched into one bandwidth update.
+ */
+ queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
+}
+
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
@@ -2750,6 +2787,9 @@ int test_clear_page_writeback(struct page *page)
dec_wb_stat(wb, WB_WRITEBACK);
__wb_writeout_inc(wb);
+ if (!mapping_tagged(mapping,
+ PAGECACHE_TAG_WRITEBACK))
+ wb_inode_writeback_end(wb);
}
}
@@ -2792,8 +2832,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
- inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ inc_wb_stat(wb, WB_WRITEBACK);
+ if (!on_wblist)
+ wb_inode_writeback_start(wb);
+ }
/*
* We can come through here when swapping anonymous
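The page-writeback.c changes above move the 200ms rate limiting out of __wb_update_bandwidth() and instead restart the estimation window when a writeback burst begins after an idle period, so idle time no longer dilutes the estimate. The sketch below models just that stamping logic in user space; it uses a plain ratio where the kernel uses wb_update_write_bandwidth()'s exponential smoothing, and the structure fields and jiffy values are illustrative only.

#include <stdio.h>

#define HZ                     100UL
#define BANDWIDTH_INTERVAL     (HZ / 5)   /* 200ms, as in the kernel */
#define WB_BANDWIDTH_IDLE_JIF  (HZ)       /* 1s idle threshold */

struct toy_wb {
	unsigned long written;         /* pages written so far (WB_WRITTEN) */
	unsigned long written_stamp;   /* snapshot at last estimate */
	unsigned long bw_time_stamp;   /* "jiffies" of last estimate */
	unsigned long write_bandwidth; /* pages per BANDWIDTH_INTERVAL */
	int writeback_inodes;          /* inodes currently under writeback */
};

/* wb_bandwidth_estimate_start(): after a long idle gap with nothing under
 * writeback, resync the stamps so idle time doesn't dilute the estimate. */
static void estimate_start(struct toy_wb *wb, unsigned long now)
{
	if (now - wb->bw_time_stamp > WB_BANDWIDTH_IDLE_JIF &&
	    !wb->writeback_inodes) {
		wb->written_stamp = wb->written;
		wb->bw_time_stamp = now;
	}
}

/* __wb_update_bandwidth(): unconditional update, elapsed clamped to >= 1. */
static void update_bandwidth(struct toy_wb *wb, unsigned long now)
{
	unsigned long elapsed = now - wb->bw_time_stamp;
	unsigned long delta = wb->written - wb->written_stamp;

	if (elapsed < 1)
		elapsed = 1;
	wb->write_bandwidth = delta * BANDWIDTH_INTERVAL / elapsed;
	wb->written_stamp = wb->written;
	wb->bw_time_stamp = now;
}

int main(void)
{
	struct toy_wb wb = { .bw_time_stamp = 0 };

	/* burst 1: 1000 pages over 100 jiffies */
	wb.writeback_inodes = 1;
	wb.written += 1000;
	update_bandwidth(&wb, 100);
	wb.writeback_inodes = 0;
	printf("bw after burst 1: %lu pages/200ms\n", wb.write_bandwidth);

	/* 10s of idle, then a new burst of the same size */
	estimate_start(&wb, 1100);
	wb.writeback_inodes = 1;
	wb.written += 1000;
	update_bandwidth(&wb, 1200);
	printf("bw after burst 2: %lu pages/200ms\n", wb.write_bandwidth);
	return 0;
}

Without the estimate_start() resync, the second burst would be averaged over the ten idle seconds and the estimate would collapse, which is the situation wb_bandwidth_estimate_start() guards against.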
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eeb3a9cb36bb..f95e1d2386a1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4211,7 +4211,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
if (tsk_is_oom_victim(current) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
- if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
+ if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
show_mem(filter, nodemask);
@@ -4549,14 +4549,14 @@ static bool __need_reclaim(gfp_t gfp_mask)
return true;
}
-void __fs_reclaim_acquire(void)
+void __fs_reclaim_acquire(unsigned long ip)
{
- lock_map_acquire(&__fs_reclaim_map);
+ lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
}
-void __fs_reclaim_release(void)
+void __fs_reclaim_release(unsigned long ip)
{
- lock_map_release(&__fs_reclaim_map);
+ lock_release(&__fs_reclaim_map, ip);
}
void fs_reclaim_acquire(gfp_t gfp_mask)
@@ -4565,7 +4565,7 @@ void fs_reclaim_acquire(gfp_t gfp_mask)
if (__need_reclaim(gfp_mask)) {
if (gfp_mask & __GFP_FS)
- __fs_reclaim_acquire();
+ __fs_reclaim_acquire(_RET_IP_);
#ifdef CONFIG_MMU_NOTIFIER
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
@@ -4582,7 +4582,7 @@ void fs_reclaim_release(gfp_t gfp_mask)
if (__need_reclaim(gfp_mask)) {
if (gfp_mask & __GFP_FS)
- __fs_reclaim_release();
+ __fs_reclaim_release(_RET_IP_);
}
}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
@@ -4697,7 +4697,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* comment for __cpuset_node_allowed().
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(current)) && !in_interrupt())
+ } else if (unlikely(rt_task(current)) && in_task())
alloc_flags |= ALLOC_HARDER;
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
@@ -5157,7 +5157,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
* When we are in the interrupt context, it is irrelevant
* to the current task context. It means that any node ok.
*/
- if (!in_interrupt() && !ac->nodemask)
+ if (in_task() && !ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
@@ -5903,6 +5903,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " kernel_misc_reclaimable:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
global_node_page_state(NR_INACTIVE_ANON),
@@ -5919,6 +5920,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
global_zone_page_state(NR_BOUNCE),
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
global_zone_page_state(NR_FREE_CMA_PAGES));
@@ -6155,7 +6157,7 @@ static int node_load[MAX_NUMNODES];
*
* Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
-static int find_next_best_node(int node, nodemask_t *used_node_mask)
+int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
@@ -6640,7 +6642,6 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-#if !defined(CONFIG_FLATMEM)
/*
* Only struct pages that correspond to ranges defined by memblock.memory
* are zeroed and initialized by going through __init_single_page() during
@@ -6685,13 +6686,6 @@ static void __init init_unavailable_range(unsigned long spfn,
pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
node, zone_names[zone], pgcnt);
}
-#else
-static inline void init_unavailable_range(unsigned long spfn,
- unsigned long epfn,
- int zone, int node)
-{
-}
-#endif
static void __init memmap_init_zone_range(struct zone *zone,
unsigned long start_pfn,
@@ -6721,7 +6715,7 @@ static void __init memmap_init(void)
{
unsigned long start_pfn, end_pfn;
unsigned long hole_pfn = 0;
- int i, j, zone_id, nid;
+ int i, j, zone_id = 0, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
struct pglist_data *node = NODE_DATA(nid);
@@ -6754,6 +6748,26 @@ static void __init memmap_init(void)
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
+void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, int nid, bool exact_nid)
+{
+ void *ptr;
+
+ if (exact_nid)
+ ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid);
+ else
+ ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid);
+
+ if (ptr && size > 0)
+ page_init_poison(ptr, size);
+
+ return ptr;
+}
+
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
@@ -7501,7 +7515,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
}
#ifdef CONFIG_FLATMEM
-static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
unsigned long __maybe_unused start = 0;
unsigned long __maybe_unused offset = 0;
@@ -7525,8 +7539,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
- map = memblock_alloc_node(size, SMP_CACHE_BYTES,
- pgdat->node_id);
+ map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
+ pgdat->node_id, false);
if (!map)
panic("Failed to allocate %ld bytes for node %d memory map\n",
size, pgdat->node_id);
@@ -7547,7 +7561,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
#endif
}
#else
-static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
+static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
#endif /* CONFIG_FLATMEM */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -8976,7 +8990,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
cc->nr_migratepages -= nr_reclaimed;
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
- NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
/*
* On -ENOMEM, migrate_pages() bails out right away. It is pointless
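memmap_alloc() above gives the FLATMEM path and the SPARSEMEM callers (see the mm/sparse.c hunks later in this patch) one helper that both allocates the raw memmap and poisons it via page_init_poison(), instead of each caller open-coding the memblock call. The fragment below is only a user-space caricature of that contract; the two toy allocators stand in for memblock_alloc_exact_nid_raw() and memblock_alloc_try_nid_raw(), and the poison byte is an illustrative stand-in for PAGE_POISON.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_POISON 0xaa	/* stand-in for PAGE_POISON */

/* "Raw" allocations come back uninitialized, hence the poisoning below. */
static void *toy_alloc_exact_nid(size_t size, int nid) { (void)nid; return malloc(size); }
static void *toy_alloc_try_nid(size_t size, int nid)   { (void)nid; return malloc(size); }

static void *toy_memmap_alloc(size_t size, int nid, int exact_nid)
{
	void *ptr = exact_nid ? toy_alloc_exact_nid(size, nid)
			      : toy_alloc_try_nid(size, nid);

	if (ptr && size > 0)
		memset(ptr, TOY_POISON, size);	/* page_init_poison() stand-in */
	return ptr;
}

int main(void)
{
	unsigned char *map = toy_memmap_alloc(4096, 0, 1);

	if (!map)
		return 1;
	printf("memmap byte 0: 0x%02x (poisoned before first use)\n", map[0]);
	free(map);
	return 0;
}

The value of the helper is that every memmap, whichever memory model or allocator branch produced it, reaches its first real initialization consistently poisoned.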
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index bddf788f45bf..fff55bb830f9 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -287,6 +287,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn, flags;
struct page *page;
struct zone *zone;
+ int ret;
/*
* Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
@@ -299,15 +300,21 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
break;
}
page = __first_valid_page(start_pfn, end_pfn - start_pfn);
- if ((pfn < end_pfn) || !page)
- return -EBUSY;
+ if ((pfn < end_pfn) || !page) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
spin_unlock_irqrestore(&zone->lock, flags);
+ ret = pfn < end_pfn ? -EBUSY : 0;
+
+out:
trace_test_pages_isolated(start_pfn, end_pfn, pfn);
- return pfn < end_pfn ? -EBUSY : 0;
+ return ret;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 7f2e0151c4e2..e1c20837a42a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1520,9 +1520,6 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
* the bookkeeping information accordingly. Must be called after each
* successful population.
- *
- * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
- * is to serve an allocation in that area.
*/
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
int page_end)
diff --git a/mm/shmem.c b/mm/shmem.c
index 3107acee4f71..88742953532c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -38,8 +38,7 @@
#include <linux/hugetlb.h>
#include <linux/frontswap.h>
#include <linux/fs_parser.h>
-
-#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+#include <linux/swapfile.h>
static struct vfsmount *shm_mnt;
@@ -137,9 +136,6 @@ static unsigned long shmem_default_max_inodes(void)
}
#endif
-static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
-static int shmem_replace_page(struct page **pagep, gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index);
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
@@ -278,10 +274,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
ino_t ino;
if (!(sb->s_flags & SB_KERNMOUNT)) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_inodes) {
if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
return -ENOSPC;
}
sbinfo->free_inodes--;
@@ -304,7 +300,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
}
*inop = ino;
}
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
} else if (inop) {
/*
* __shmem_file_setup, one of our callers, is lock-free: it
@@ -319,13 +315,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
* to worry about things like glibc compatibility.
*/
ino_t *next_ino;
+
next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
ino = *next_ino;
if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
ino = sbinfo->next_ino;
sbinfo->next_ino += SHMEM_INO_BATCH;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
if (unlikely(is_zero_ino(ino)))
ino++;
}
@@ -341,9 +338,9 @@ static void shmem_free_inode(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
}
@@ -474,7 +471,38 @@ static bool shmem_confirm_swap(struct address_space *mapping,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */
-static int shmem_huge __read_mostly;
+static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
+
+bool shmem_is_huge(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+ loff_t i_size;
+
+ if (shmem_huge == SHMEM_HUGE_DENY)
+ return false;
+ if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
+ return false;
+ if (shmem_huge == SHMEM_HUGE_FORCE)
+ return true;
+
+ switch (SHMEM_SB(inode->i_sb)->huge) {
+ case SHMEM_HUGE_ALWAYS:
+ return true;
+ case SHMEM_HUGE_WITHIN_SIZE:
+ index = round_up(index, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= index)
+ return true;
+ fallthrough;
+ case SHMEM_HUGE_ADVISE:
+ if (vma && (vma->vm_flags & VM_HUGEPAGE))
+ return true;
+ fallthrough;
+ default:
+ return false;
+ }
+}
#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
@@ -645,6 +673,12 @@ static long shmem_unused_huge_count(struct super_block *sb,
#define shmem_huge SHMEM_HUGE_DENY
+bool shmem_is_huge(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+ return false;
+}
+
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shrink_control *sc, unsigned long nr_to_split)
{
@@ -652,15 +686,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
-{
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
- shmem_huge != SHMEM_HUGE_DENY)
- return true;
- return false;
-}
-
/*
* Like add_to_page_cache_locked, but error if expected item has gone.
*/
@@ -905,6 +930,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (lend == -1)
end = -1; /* unsigned, so actually very big */
+ if (info->fallocend > start && info->fallocend <= end && !unfalloc)
+ info->fallocend = start;
+
pagevec_init(&pvec);
index = start;
while (index < end && find_lock_entries(mapping, index, end - 1,
@@ -1038,7 +1066,6 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
{
struct inode *inode = path->dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
spin_lock_irq(&info->lock);
@@ -1047,7 +1074,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
}
generic_fillattr(&init_user_ns, inode, stat);
- if (is_huge_enabled(sb_info))
+ if (shmem_is_huge(NULL, inode, 0))
stat->blksize = HPAGE_PMD_SIZE;
return 0;
@@ -1058,7 +1085,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
int error;
error = setattr_prepare(&init_user_ns, dentry, attr);
@@ -1094,24 +1120,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
if (oldsize > holebegin)
unmap_mapping_range(inode->i_mapping,
holebegin, 0, 1);
-
- /*
- * Part of the huge page can be beyond i_size: subject
- * to shrink under memory pressure.
- */
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- spin_lock(&sbinfo->shrinklist_lock);
- /*
- * _careful to defend against unlocked access to
- * ->shrink_list in shmem_unused_huge_shrink()
- */
- if (list_empty_careful(&info->shrinklist)) {
- list_add_tail(&info->shrinklist,
- &sbinfo->shrinklist);
- sbinfo->shrinklist_len++;
- }
- spin_unlock(&sbinfo->shrinklist_lock);
- }
}
}
@@ -1156,8 +1164,6 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
-extern struct swap_info_struct *swap_info[];
-
static int shmem_find_swap_entries(struct address_space *mapping,
pgoff_t start, unsigned int nr_entries,
struct page **entries, pgoff_t *indices,
@@ -1338,7 +1344,19 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
swp_entry_t swap;
pgoff_t index;
- VM_BUG_ON_PAGE(PageCompound(page), page);
+ /*
+ * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
+ * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
+ * and its shmem_writeback() needs them to be split when swapping.
+ */
+ if (PageTransCompound(page)) {
+ /* Ensure the subpages are still dirty */
+ SetPageDirty(page);
+ if (split_huge_page(page) < 0)
+ goto redirty;
+ ClearPageDirty(page);
+ }
+
BUG_ON(!PageLocked(page));
mapping = page->mapping;
index = page->index;
@@ -1453,10 +1471,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
struct mempolicy *mpol = NULL;
if (sbinfo->mpol) {
- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
mpol = sbinfo->mpol;
mpol_get(mpol);
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
return mpol;
}
@@ -1798,7 +1816,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
struct page *page;
- enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
gfp_t huge_gfp;
int error;
@@ -1807,8 +1824,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
- if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
- sgp = SGP_CACHE;
repeat:
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
@@ -1840,26 +1855,31 @@ repeat:
return error;
}
- if (page)
+ if (page) {
hindex = page->index;
- if (page && sgp == SGP_WRITE)
- mark_page_accessed(page);
-
- /* fallocated page? */
- if (page && !PageUptodate(page)) {
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+ if (PageUptodate(page))
+ goto out;
+ /* fallocated page */
if (sgp != SGP_READ)
goto clear;
unlock_page(page);
put_page(page);
- page = NULL;
- hindex = index;
}
- if (page || sgp == SGP_READ)
- goto out;
/*
- * Fast cache lookup did not find it:
- * bring it back from swap or allocate.
+ * SGP_READ: succeed on hole, with NULL page, letting caller zero.
+ * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail.
+ */
+ *pagep = NULL;
+ if (sgp == SGP_READ)
+ return 0;
+ if (sgp == SGP_NOALLOC)
+ return -ENOENT;
+
+ /*
+ * Fast cache lookup and swap lookup did not find it: allocate.
*/
if (vma && userfaultfd_missing(vma)) {
@@ -1867,36 +1887,12 @@ repeat:
return 0;
}
- /* shmem_symlink() */
- if (!shmem_mapping(mapping))
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_FORCE)
- goto alloc_huge;
- switch (sbinfo->huge) {
- case SHMEM_HUGE_NEVER:
+ /* Never use a huge page for shmem_symlink() */
+ if (S_ISLNK(inode->i_mode))
goto alloc_nohuge;
- case SHMEM_HUGE_WITHIN_SIZE: {
- loff_t i_size;
- pgoff_t off;
-
- off = round_up(index, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- goto alloc_huge;
-
- fallthrough;
- }
- case SHMEM_HUGE_ADVISE:
- if (sgp_huge == SGP_HUGE)
- goto alloc_huge;
- /* TODO: implement fadvise() hints */
+ if (!shmem_is_huge(vma, inode, index))
goto alloc_nohuge;
- }
-alloc_huge:
huge_gfp = vma_thp_gfp_mask(vma);
huge_gfp = limit_gfp_mask(huge_gfp, gfp);
page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
@@ -2052,7 +2048,6 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
- enum sgp_type sgp;
int err;
vm_fault_t ret = VM_FAULT_LOCKED;
@@ -2115,15 +2110,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
spin_unlock(&inode->i_lock);
}
- sgp = SGP_CACHE;
-
- if ((vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- sgp = SGP_NOHUGE;
- else if (vma->vm_flags & VM_HUGEPAGE)
- sgp = SGP_HUGE;
-
- err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
+ err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
gfp, vma, vmf, &ret);
if (err)
return vmf_error(err);
@@ -2655,7 +2642,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_falloc shmem_falloc;
- pgoff_t start, index, end;
+ pgoff_t start, index, end, undo_fallocend;
int error;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2724,7 +2711,16 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
inode->i_private = &shmem_falloc;
spin_unlock(&inode->i_lock);
- for (index = start; index < end; index++) {
+ /*
+ * info->fallocend is only relevant when huge pages might be
+ * involved: to prevent split_huge_page() from freeing fallocated
+ * pages that FALLOC_FL_KEEP_SIZE committed beyond i_size.
+ */
+ undo_fallocend = info->fallocend;
+ if (info->fallocend < end)
+ info->fallocend = end;
+
+ for (index = start; index < end; ) {
struct page *page;
/*
@@ -2738,6 +2734,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
else
error = shmem_getpage(inode, index, &page, SGP_FALLOC);
if (error) {
+ info->fallocend = undo_fallocend;
/* Remove the !PageUptodate pages we added */
if (index > start) {
shmem_undo_range(inode,
@@ -2747,13 +2744,26 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
goto undone;
}
+ index++;
+ /*
+ * Here is a more important optimization than it appears:
+ * a second SGP_FALLOC on the same huge page will clear it,
+ * making it PageUptodate and un-undoable if we fail later.
+ */
+ if (PageTransCompound(page)) {
+ index = round_up(index, HPAGE_PMD_NR);
+ /* Beware 32-bit wraparound */
+ if (!index)
+ index--;
+ }
+
/*
* Inform shmem_writepage() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
- shmem_falloc.next++;
if (!PageUptodate(page))
- shmem_falloc.nr_falloced++;
+ shmem_falloc.nr_falloced += index - shmem_falloc.next;
+ shmem_falloc.next = index;
/*
* If !PageUptodate, leave it that way so that freeable pages
@@ -3488,9 +3498,10 @@ static int shmem_reconfigure(struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
unsigned long inodes;
+ struct mempolicy *mpol = NULL;
const char *err;
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
@@ -3535,14 +3546,15 @@ static int shmem_reconfigure(struct fs_context *fc)
* Preserve previous mempolicy unless mpol remount option was specified.
*/
if (ctx->mpol) {
- mpol_put(sbinfo->mpol);
+ mpol = sbinfo->mpol;
sbinfo->mpol = ctx->mpol; /* transfers initial ref */
ctx->mpol = NULL;
}
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
+ mpol_put(mpol);
return 0;
out:
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
return invalfc(fc, "%s", err);
}
@@ -3613,7 +3625,6 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct inode *inode;
struct shmem_sb_info *sbinfo;
- int err = -ENOMEM;
/* Round up to L1_CACHE_BYTES to resist false sharing */
sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
@@ -3659,7 +3670,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sbinfo->mpol = ctx->mpol;
ctx->mpol = NULL;
- spin_lock_init(&sbinfo->stat_lock);
+ raw_spin_lock_init(&sbinfo->stat_lock);
if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
spin_lock_init(&sbinfo->shrinklist_lock);
@@ -3691,7 +3702,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
failed:
shmem_put_super(sb);
- return err;
+ return -ENOMEM;
}
static int shmem_get_tree(struct fs_context *fc)
@@ -3907,7 +3918,7 @@ int __init shmem_init(void)
if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
else
- shmem_huge = 0; /* just in case it was patched */
+ shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
#endif
return 0;
@@ -3976,42 +3987,6 @@ struct kobj_attribute shmem_enabled_attr =
__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-bool shmem_huge_enabled(struct vm_area_struct *vma)
-{
- struct inode *inode = file_inode(vma->vm_file);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- loff_t i_size;
- pgoff_t off;
-
- if (!transhuge_vma_enabled(vma, vma->vm_flags))
- return false;
- if (shmem_huge == SHMEM_HUGE_FORCE)
- return true;
- if (shmem_huge == SHMEM_HUGE_DENY)
- return false;
- switch (sbinfo->huge) {
- case SHMEM_HUGE_NEVER:
- return false;
- case SHMEM_HUGE_ALWAYS:
- return true;
- case SHMEM_HUGE_WITHIN_SIZE:
- off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- return true;
- fallthrough;
- case SHMEM_HUGE_ADVISE:
- /* TODO: implement fadvise() hints */
- return (vma->vm_flags & VM_HUGEPAGE);
- default:
- VM_BUG_ON(1);
- return false;
- }
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
#else /* !CONFIG_SHMEM */
/*
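Since the shmem.c changes fold shmem_huge_enabled() and the old per-fault SGP_HUGE/SGP_NOHUGE plumbing into the single shmem_is_huge() predicate, the decision order matters: the global deny/force knob and the VMA opt-outs are consulted before the per-superblock mount policy. The user-space restatement below mirrors the ladder in the hunk near the top of the shmem.c diff; the enum values, page sizes and helper names are illustrative, not the kernel definitions.

#include <stdbool.h>
#include <stdio.h>

enum { HUGE_NEVER, HUGE_ALWAYS, HUGE_WITHIN_SIZE, HUGE_ADVISE, HUGE_DENY, HUGE_FORCE };

#define PAGE_SHIFT     12
#define HPAGE_PMD_NR   512	/* 2MB / 4KB */
#define HPAGE_PMD_SIZE ((long)HPAGE_PMD_NR << PAGE_SHIFT)

static int shmem_huge = HUGE_NEVER;	/* global sysfs knob */

static long round_up_l(long x, long a) { return (x + a - 1) / a * a; }

static bool toy_shmem_is_huge(bool vma_nohuge, bool vma_huge,
			      int sb_huge, long i_size, long index)
{
	if (shmem_huge == HUGE_DENY)
		return false;
	if (vma_nohuge)			/* VM_NOHUGEPAGE / MMF_DISABLE_THP */
		return false;
	if (shmem_huge == HUGE_FORCE)
		return true;

	switch (sb_huge) {
	case HUGE_ALWAYS:
		return true;
	case HUGE_WITHIN_SIZE:
		index  = round_up_l(index, HPAGE_PMD_NR);
		i_size = round_up_l(i_size, 1L << PAGE_SHIFT);
		if (i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= index)
			return true;
		/* fall through */
	case HUGE_ADVISE:
		return vma_huge;	/* VM_HUGEPAGE */
	default:
		return false;
	}
}

int main(void)
{
	/* within_size mount, 4MB file: index 0 qualifies, index 2048 does not */
	printf("%d %d\n",
	       toy_shmem_is_huge(false, false, HUGE_WITHIN_SIZE, 4 << 20, 0),
	       toy_shmem_is_huge(false, false, HUGE_WITHIN_SIZE, 4 << 20, 2048));
	return 0;
}

For a 4MB within_size mount, a fault in the first huge-page-sized extent qualifies, while one well beyond the rounded i_size falls through to the ADVISE case and is refused unless the VMA set VM_HUGEPAGE.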
diff --git a/mm/sparse.c b/mm/sparse.c
index 6326cdf36c4f..120bc8ea5293 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -109,32 +109,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
}
#endif
-#ifdef CONFIG_SPARSEMEM_EXTREME
-unsigned long __section_nr(struct mem_section *ms)
-{
- unsigned long root_nr;
- struct mem_section *root = NULL;
-
- for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
- root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
- if (!root)
- continue;
-
- if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
- break;
- }
-
- VM_BUG_ON(!root);
-
- return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
-}
-#else
-unsigned long __section_nr(struct mem_section *ms)
-{
- return (unsigned long)(ms - mem_section[0]);
-}
-#endif
-
/*
* During early boot, before section_mem_map is used for an actual
* mem_map, we use section_mem_map to store the section's NUMA
@@ -143,7 +117,7 @@ unsigned long __section_nr(struct mem_section *ms)
*/
static inline unsigned long sparse_encode_early_nid(int nid)
{
- return (nid << SECTION_NID_SHIFT);
+ return ((unsigned long)nid << SECTION_NID_SHIFT);
}
static inline int sparse_early_nid(struct mem_section *section)
@@ -187,10 +161,9 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
* those loops early.
*/
unsigned long __highest_present_section_nr;
-static void section_mark_present(struct mem_section *ms)
+static void __section_mark_present(struct mem_section *ms,
+ unsigned long section_nr)
{
- unsigned long section_nr = __section_nr(ms);
-
if (section_nr > __highest_present_section_nr)
__highest_present_section_nr = section_nr;
@@ -280,7 +253,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
if (!ms->section_mem_map) {
ms->section_mem_map = sparse_encode_early_nid(nid) |
SECTION_IS_ONLINE;
- section_mark_present(ms);
+ __section_mark_present(ms, section);
}
}
}
@@ -348,7 +321,8 @@ size_t mem_section_usage_size(void)
static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
{
#ifndef CONFIG_NUMA
- return __pa_symbol(pgdat);
+ VM_BUG_ON(pgdat != &contig_page_data);
+ return __pa_symbol(&contig_page_data);
#else
return __pa(pgdat);
#endif
@@ -462,8 +436,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
if (map)
return map;
- map = memblock_alloc_try_nid_raw(size, size, addr,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ map = memmap_alloc(size, size, addr, nid, false);
if (!map)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
__func__, size, PAGE_SIZE, nid, &addr);
@@ -490,8 +463,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
* and we want it to be properly aligned to the section size - this is
* especially the case for VMEMMAP which maps memmap to PMDs
*/
- sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
- addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
sparsemap_buf_end = sparsemap_buf + size;
}
@@ -934,7 +906,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
- section_mark_present(ms);
+ __section_mark_present(ms, section_nr);
/* Align memmap to section boundary in the subsection case */
if (section_nr_to_pfn(section_nr) != start_pfn)
diff --git a/mm/swap.c b/mm/swap.c
index 19600430e536..897200d27dd0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -179,28 +179,6 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
}
EXPORT_SYMBOL_GPL(get_kernel_pages);
-/*
- * get_kernel_page() - pin a kernel page in memory
- * @start: starting kernel address
- * @write: pinning for read/write, currently ignored
- * @pages: array that receives pointer to the page pinned.
- * Must be at least nr_segs long.
- *
- * Returns 1 if page is pinned. If the page was not pinned, returns
- * -errno. The page returned must be released with a put_page() call
- * when it is finished with.
- */
-int get_kernel_page(unsigned long start, int write, struct page **pages)
-{
- const struct kvec kiov = {
- .iov_base = (void *)start,
- .iov_len = PAGE_SIZE
- };
-
- return get_kernel_pages(&kiov, 1, write, pages);
-}
-EXPORT_SYMBOL_GPL(get_kernel_page);
-
static void pagevec_lru_move_fn(struct pagevec *pvec,
void (*move_fn)(struct page *page, struct lruvec *lruvec))
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1e07d1c776f2..22d10f713848 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3130,6 +3130,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
+ struct dentry *dentry;
int prio;
int error;
union swap_header *swap_header;
@@ -3173,6 +3174,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->swap_file = swap_file;
mapping = swap_file->f_mapping;
+ dentry = swap_file->f_path.dentry;
inode = mapping->host;
error = claim_swapfile(p, inode);
@@ -3180,6 +3182,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap;
inode_lock(inode);
+ if (d_unlinked(dentry) || cant_mount(dentry)) {
+ error = -ENOENT;
+ goto bad_swap_unlock_inode;
+ }
if (IS_SWAPFILE(inode)) {
error = -EBUSY;
goto bad_swap_unlock_inode;
@@ -3773,7 +3779,7 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
struct swap_info_struct *si, *next;
int nid = page_to_nid(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 44ad5e515140..714eaf19821d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -484,8 +484,9 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
index = indices[i];
if (xa_is_value(page)) {
- invalidate_exceptional_entry(mapping, index,
- page);
+ count += invalidate_exceptional_entry(mapping,
+ index,
+ page);
continue;
}
index += thp_nr_pages(page) - 1;
@@ -513,19 +514,18 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
}
/**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
+ * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
+ * @mapping: the address_space which holds the cache to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
*
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
+ * This function removes pages that are clean, unmapped and unlocked,
+ * as well as shadow entries. It will not block on IO activity.
*
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
+ * If you want to remove all the pages of one inode, regardless of
+ * their use and writeback state, use truncate_inode_pages().
*
- * Return: the number of the pages that were invalidated
+ * Return: the number of the cache entries that were invalidated
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -561,21 +561,19 @@ void invalidate_mapping_pagevec(struct address_space *mapping,
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
- unsigned long flags;
-
if (page->mapping != mapping)
return 0;
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xa_lock_irq(&mapping->i_pages);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
__delete_from_page_cache(page, NULL);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -583,7 +581,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
put_page(page); /* pagecache ref */
return 1;
failed:
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
return 0;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0e2132834bc7..7a9008415534 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -483,7 +483,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long src_start,
unsigned long len,
enum mcopy_atomic_mode mcopy_mode,
- bool *mmap_changing,
+ atomic_t *mmap_changing,
__u64 mode)
{
struct vm_area_struct *dst_vma;
@@ -517,7 +517,7 @@ retry:
* request the user to retry later
*/
err = -EAGAIN;
- if (mmap_changing && READ_ONCE(*mmap_changing))
+ if (mmap_changing && atomic_read(mmap_changing))
goto out_unlock;
/*
@@ -650,28 +650,29 @@ out:
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- bool *mmap_changing, __u64 mode)
+ atomic_t *mmap_changing, __u64 mode)
{
return __mcopy_atomic(dst_mm, dst_start, src_start, len,
MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool *mmap_changing)
+ unsigned long len, atomic_t *mmap_changing)
{
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
mmap_changing, 0);
}
ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool *mmap_changing)
+ unsigned long len, atomic_t *mmap_changing)
{
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
mmap_changing, 0);
}
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool enable_wp, bool *mmap_changing)
+ unsigned long len, bool enable_wp,
+ atomic_t *mmap_changing)
{
struct vm_area_struct *dst_vma;
pgprot_t newprot;
@@ -694,7 +695,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* request the user to retry later
*/
err = -EAGAIN;
- if (mmap_changing && READ_ONCE(*mmap_changing))
+ if (mmap_changing && atomic_read(mmap_changing))
goto out_unlock;
err = -ENOENT;
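The userfaultfd switch from a plain "bool *mmap_changing" to an "atomic_t *" reads most naturally as turning the flag into a nesting counter: each event that rearranges the address space can increment it for its duration, and UFFDIO_COPY-style operations back off while it is non-zero, so one event finishing cannot clear the indication while another is still in flight. That interpretation is an inference from this hunk alone (the fs/userfaultfd.c side is not shown here), so take the sketch below as a model of the protocol, not as the kernel code.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int mmap_changing;	/* zero-initialized, like the kernel field */

static void event_begin(void) { atomic_fetch_add(&mmap_changing, 1); }
static void event_end(void)   { atomic_fetch_sub(&mmap_changing, 1); }

static int try_copy(void)
{
	if (atomic_load(&mmap_changing))
		return -1;	/* -EAGAIN: caller retries later */
	/* ... the UFFDIO_COPY work would happen here ... */
	return 0;
}

int main(void)
{
	event_begin();		/* e.g. a fork event in flight */
	event_begin();		/* e.g. a concurrent mremap event */
	event_end();		/* fork done; copy must still back off */
	printf("copy while one event pending: %d\n", try_copy());
	event_end();
	printf("copy after all events done:   %d\n", try_copy());
	return 0;
}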
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d5cd52805149..3824dc16ce1c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -787,6 +787,28 @@ unsigned long vmalloc_nr_pages(void)
return atomic_long_read(&nr_vmalloc_pages);
}
+static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
+{
+ struct vmap_area *va = NULL;
+ struct rb_node *n = vmap_area_root.rb_node;
+
+ while (n) {
+ struct vmap_area *tmp;
+
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_end > addr) {
+ va = tmp;
+ if (tmp->va_start <= addr)
+ break;
+
+ n = n->rb_left;
+ } else
+ n = n->rb_right;
+ }
+
+ return va;
+}
+
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
@@ -1479,6 +1501,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
+ unsigned long freed;
unsigned long addr;
int purged = 0;
int ret;
@@ -1542,13 +1565,12 @@ overflow:
goto retry;
}
- if (gfpflags_allow_blocking(gfp_mask)) {
- unsigned long freed = 0;
- blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
- if (freed > 0) {
- purged = 0;
- goto retry;
- }
+ freed = 0;
+ blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
+
+ if (freed > 0) {
+ purged = 0;
+ goto retry;
}
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
@@ -2779,7 +2801,7 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
- unsigned int order, unsigned long nr_pages, struct page **pages)
+ unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
@@ -2789,10 +2811,32 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* to fails, fallback to a single page allocator that is
* more permissive.
*/
- if (!order)
- nr_allocated = alloc_pages_bulk_array_node(
- gfp, nid, nr_pages, pages);
- else
+ if (!order) {
+ while (nr_allocated < nr_pages) {
+ unsigned int nr, nr_pages_request;
+
+ /*
+ * The maximum allowed request is hard-coded to 100 pages
+ * per call, in order to prevent long preemption-off
+ * sections in the bulk allocator, so the request range
+ * is [1:100].
+ */
+ nr_pages_request = min(100U, nr_pages - nr_allocated);
+
+ nr = alloc_pages_bulk_array_node(gfp, nid,
+ nr_pages_request, pages + nr_allocated);
+
+ nr_allocated += nr;
+ cond_resched();
+
+ /*
+ * If no pages or only part of the request was
+ * obtained, fall back to the single page allocator.
+ */
+ if (nr != nr_pages_request)
+ break;
+ }
+ } else
/*
* Compound pages required for remap_vmalloc_page if
* high-order pages.
@@ -2816,9 +2860,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
for (i = 0; i < (1U << order); i++)
pages[nr_allocated + i] = page + i;
- if (gfpflags_allow_blocking(gfp))
- cond_resched();
-
+ cond_resched();
nr_allocated += 1U << order;
}
@@ -3267,9 +3309,14 @@ long vread(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
spin_lock(&vmap_area_lock);
- va = __find_vmap_area((unsigned long)addr);
+ va = find_vmap_area_exceed_addr((unsigned long)addr);
if (!va)
goto finished;
+
+ /* no intersection with a live vmap_area */
+ if ((unsigned long)addr + count <= va->va_start)
+ goto finished;
+
list_for_each_entry_from(va, &vmap_area_list, list) {
if (!count)
break;
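The vm_area_alloc_pages() rework above keeps using the bulk allocator for order-0 requests but caps each call at 100 pages, with a cond_resched() between chunks, and drops to the single-page path as soon as a bulk call returns short. The user-space model below shows that control flow; toy_bulk_alloc() and its 250-page budget are invented for the example, and the real fallback loop of course calls the page allocator rather than simply counting.

#include <stdio.h>

/* Pretend bulk allocator: can hand out at most 250 pages in total. */
static unsigned int budget = 250;

static unsigned int toy_bulk_alloc(unsigned int nr)
{
	unsigned int got = nr < budget ? nr : budget;

	budget -= got;
	return got;
}

static unsigned int alloc_pages_chunked(unsigned int nr_pages)
{
	unsigned int nr_allocated = 0;

	while (nr_allocated < nr_pages) {
		/* cap each request at 100 pages to bound preempt-off time */
		unsigned int request = nr_pages - nr_allocated;
		if (request > 100)
			request = 100;

		unsigned int nr = toy_bulk_alloc(request);
		nr_allocated += nr;
		/* cond_resched() would go here between chunks */

		if (nr != request)	/* short return: fall back below */
			break;
	}

	/* single-page fallback for whatever the bulk path didn't cover */
	while (nr_allocated < nr_pages) {
		/* alloc_page()-style fallback, assumed to succeed here */
		nr_allocated++;
	}
	return nr_allocated;
}

int main(void)
{
	printf("allocated %u of 300 pages\n", alloc_pages_chunked(300));
	return 0;
}

With a 300-page request, the toy bulk allocator satisfies 250 pages in three capped calls and the remaining 50 come from the fallback path.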
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index d69019fc3789..76518e4166dc 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -74,8 +74,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work)
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
- struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
memcg = parent_mem_cgroup(memcg);
if (!memcg)
@@ -240,7 +239,12 @@ static void vmpressure_work_fn(struct work_struct *work)
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
- struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ struct vmpressure *vmpr;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ vmpr = memcg_to_vmpressure(memcg);
/*
* Here we only want to account pressure that userland is able to
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeae2f6bc532..740d03e6dae2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
@@ -121,6 +122,9 @@ struct scan_control {
/* The file pages on the current node are dangerously low */
unsigned int file_is_tiny:1;
+ /* Always discard instead of demoting to lower tier memory */
+ unsigned int no_demotion:1;
+
/* Allocation order */
s8 order;
@@ -518,6 +522,48 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}
+static bool can_demote(int nid, struct scan_control *sc)
+{
+ if (!numa_demotion_enabled)
+ return false;
+ if (sc) {
+ if (sc->no_demotion)
+ return false;
+ /* It is pointless to do demotion in memcg reclaim */
+ if (cgroup_reclaim(sc))
+ return false;
+ }
+ if (next_demotion_node(nid) == NUMA_NO_NODE)
+ return false;
+
+ return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+ int nid,
+ struct scan_control *sc)
+{
+ if (memcg == NULL) {
+ /*
+ * For non-memcg reclaim, is there
+ * space in any swap device?
+ */
+ if (get_nr_swap_pages() > 0)
+ return true;
+ } else {
+ /* Is the memcg below its swap limit? */
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+ return true;
+ }
+
+ /*
+ * The page can not be swapped.
+ *
+ * Can it be reclaimed from this node via demotion?
+ */
+ return can_demote(nid, sc);
+}
+
/*
* This misses isolated pages which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
@@ -529,7 +575,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
@@ -893,6 +939,7 @@ out:
void drop_slab_node(int nid)
{
unsigned long freed;
+ int shift = 0;
do {
struct mem_cgroup *memcg = NULL;
@@ -905,7 +952,7 @@ void drop_slab_node(int nid)
do {
freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- } while (freed > 10);
+ } while ((freed >> shift++) > 1);
}
void drop_slab(void)
@@ -1052,14 +1099,13 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed, struct mem_cgroup *target_memcg)
{
- unsigned long flags;
int refcount;
void *shadow = NULL;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy page.
*
@@ -1100,7 +1146,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_swap_cache(page, swap, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
@@ -1126,7 +1172,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
if (freepage != NULL)
freepage(page);
@@ -1135,7 +1181,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
return 1;
cannot_free:
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
return 0;
}
@@ -1264,6 +1310,54 @@ static void page_check_dirty_writeback(struct page *page,
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+ struct migration_target_control mtc = {
+ /*
+ * Allocate from 'node', or fail quickly and quietly.
+ * When this happens, 'page' will likely just be discarded
+ * instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+ __GFP_THISNODE | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = node
+ };
+
+ return alloc_migration_target(page, (unsigned long)&mtc);
+}
+
+/*
+ * Take pages on @demote_pages and attempt to demote them to
+ * another node. Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+ struct pglist_data *pgdat)
+{
+ int target_nid = next_demotion_node(pgdat->node_id);
+ unsigned int nr_succeeded;
+ int err;
+
+ if (list_empty(demote_pages))
+ return 0;
+
+ if (target_nid == NUMA_NO_NODE)
+ return 0;
+
+ /* Demotion ignores all cpuset and mempolicy settings */
+ err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+ target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+ &nr_succeeded);
+
+ if (current_is_kswapd())
+ __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+ else
+ __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
+ return nr_succeeded;
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -1275,12 +1369,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
+ LIST_HEAD(demote_pages);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
+ bool do_demote_pass;
memset(stat, 0, sizeof(*stat));
cond_resched();
+ do_demote_pass = can_demote(pgdat->node_id, sc);
+retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
@@ -1430,6 +1528,17 @@ static unsigned int shrink_page_list(struct list_head *page_list,
}
/*
+ * Before reclaiming the page, try to relocate
+ * its contents to another node.
+ */
+ if (do_demote_pass &&
+ (thp_migration_supported() || !PageTransHuge(page))) {
+ list_add(&page->lru, &demote_pages);
+ unlock_page(page);
+ continue;
+ }
+
+ /*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
* Lazyfree page could be freed directly
@@ -1624,11 +1733,14 @@ static unsigned int shrink_page_list(struct list_head *page_list,
/* follow __remove_mapping for reference */
if (!page_ref_freeze(page, 1))
goto keep_locked;
- if (PageDirty(page)) {
- page_ref_unfreeze(page, 1);
- goto keep_locked;
- }
-
+ /*
+ * The page has only one reference left, which is
+ * from the isolation. After the caller puts the
+ * page back on lru and drops the reference, the
+ * page will be freed anyway. It doesn't matter
+ * which lru it goes to. So we don't bother checking
+ * PageDirty here.
+ */
count_vm_event(PGLAZYFREED);
count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true,
@@ -1680,6 +1792,17 @@ keep:
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ /* 'page_list' is always empty here */
+
+ /* Migrate pages selected for demotion */
+ nr_reclaimed += demote_page_list(&demote_pages, pgdat);
+ /* Pages that could not be demoted are still in @demote_pages */
+ if (!list_empty(&demote_pages)) {
+ /* Pages which failed to be demoted go back on @page_list for retry: */
+ list_splice_init(&demote_pages, page_list);
+ do_demote_pass = false;
+ goto retry;
+ }
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
@@ -1698,7 +1821,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_unmap = 1,
};
struct reclaim_stat stat;
@@ -2323,10 +2445,10 @@ unsigned long reclaim_pages(struct list_head *page_list)
unsigned int noreclaim_flag;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_writepage = 1,
.may_unmap = 1,
.may_swap = 1,
+ .no_demotion = 1,
};
noreclaim_flag = memalloc_noreclaim_save();
@@ -2452,6 +2574,7 @@ enum scan_balance {
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
@@ -2462,7 +2585,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2645,6 +2768,21 @@ out:
}
}
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+ struct scan_control *sc)
+{
+ /* Aging the anon LRU is valuable if swap is present: */
+ if (total_swap_pages > 0)
+ return true;
+
+ /* Also valuable if anon pages can be demoted: */
+ return can_demote(pgdat->node_id, sc);
+}
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
@@ -2754,7 +2892,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+ if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2824,7 +2963,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
*/
pages_for_compaction = compact_gap(sc->order);
inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
return inactive_lru_pages > pages_for_compaction;
@@ -2898,6 +3037,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
@@ -3434,18 +3579,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
* blocked waiting on the same lock. Instead, throttle for up to a
* second before continuing.
*/
- if (!(gfp_mask & __GFP_FS)) {
+ if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
allow_direct_reclaim(pgdat), HZ);
+ else
+ /* Throttle until kswapd wakes the process */
+ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+ allow_direct_reclaim(pgdat));
- goto check_pending;
- }
-
- /* Throttle until kswapd wakes the process */
- wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
- allow_direct_reclaim(pgdat));
-
-check_pending:
if (fatal_signal_pending(current))
return true;
@@ -3583,7 +3724,7 @@ static void age_active_anon(struct pglist_data *pgdat,
struct mem_cgroup *memcg;
struct lruvec *lruvec;
- if (!total_swap_pages)
+ if (!can_age_anon_pages(pgdat, sc))
return;
lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -3812,7 +3953,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
- __fs_reclaim_acquire();
+ __fs_reclaim_acquire(_THIS_IP_);
count_vm_event(PAGEOUTRUN);
@@ -3938,9 +4079,9 @@ restart:
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
- __fs_reclaim_release();
+ __fs_reclaim_release(_THIS_IP_);
ret = try_to_freeze();
- __fs_reclaim_acquire();
+ __fs_reclaim_acquire(_THIS_IP_);
if (ret || kthread_should_stop())
break;
@@ -3992,7 +4133,7 @@ out:
}
snapshot_refaults(NULL, pgdat);
- __fs_reclaim_release();
+ __fs_reclaim_release(_THIS_IP_);
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
@@ -4290,23 +4431,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
*/
-int kswapd_run(int nid)
+void kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- int ret = 0;
if (pgdat->kswapd)
- return 0;
+ return;
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
- ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
}
- return ret;
}
/*
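The shrink loop at the top of the mm/vmscan.c diff replaces the fixed "freed > 10" cutoff with "(freed >> shift++) > 1", so another pass is only taken while the amount freed clears a threshold that doubles on every iteration, which bounds the number of passes even if each one keeps trickling out a few objects. A minimal userspace sketch of that termination behaviour; fake_shrink_pass() is an invented stand-in for the real shrink_slab() work, not a kernel API:

/* Standalone sketch (plain C, userspace) of the new loop cutoff. */
#include <stdio.h>

/* Invented stand-in for a shrink pass: frees half of whatever is left. */
static unsigned long fake_shrink_pass(unsigned long *remaining)
{
	unsigned long freed = *remaining / 2;

	*remaining -= freed;
	return freed;
}

int main(void)
{
	unsigned long freed, remaining = 1UL << 20;
	int shift = 0;

	do {
		freed = fake_shrink_pass(&remaining);
		printf("pass %2d: freed %8lu, cutoff %8lu\n",
		       shift, freed, 1UL << (shift + 1));
	} while ((freed >> shift++) > 1);

	return 0;
}

With the old fixed cutoff the loop could spin indefinitely whenever a pass reliably freed a few dozen objects; with the doubling cutoff it stops after a bounded number of passes.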
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a7ed56ac4c0b..0885a34197b7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -204,7 +204,7 @@ int calculate_normal_threshold(struct zone *zone)
*
* Some sample thresholds:
*
- * Threshold Processors (fls) Zonesize fls(mem+1)
+ * Threshold Processors (fls) Zonesize fls(mem)+1
* ------------------------------------------------------------------
* 8 1 1 0.9-1 GB 4
* 16 2 2 0.9-1 GB 4
@@ -1217,6 +1217,8 @@ const char * const vmstat_text[] = {
"pgreuse",
"pgsteal_kswapd",
"pgsteal_direct",
+ "pgdemote_kswapd",
+ "pgdemote_direct",
"pgscan_kswapd",
"pgscan_direct",
"pgscan_direct_throttle",
@@ -1452,7 +1454,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
}
/* Print out the free pages at each order for each migratetype */
-static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
+static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
int order;
pg_data_t *pgdat = (pg_data_t *)arg;
@@ -1464,8 +1466,6 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
seq_putc(m, '\n');
walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
-
- return 0;
}
static void pagetypeinfo_showblockcount_print(struct seq_file *m,
@@ -1501,7 +1501,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
}
/* Print out the number of pageblocks for each migratetype */
-static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
+static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
int mtype;
pg_data_t *pgdat = (pg_data_t *)arg;
@@ -1512,8 +1512,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
seq_putc(m, '\n');
walk_zones_in_node(m, pgdat, true, false,
pagetypeinfo_showblockcount_print);
-
- return 0;
}
/*
@@ -1874,11 +1872,6 @@ static void vmstat_update(struct work_struct *w)
}
/*
- * Switch off vmstat processing and then fold all the remaining differentials
- * until the diffs stay at zero. The function is used by NOHZ and can only be
- * invoked when tick processing is not active.
- */
-/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
*/
@@ -1894,17 +1887,15 @@ static bool need_update(int cpu)
/*
* The fast way of checking if there are any vmstat diffs.
*/
- if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
- sizeof(pzstats->vm_stat_diff[0])))
+ if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
return true;
if (last_pgdat == zone->zone_pgdat)
continue;
last_pgdat = zone->zone_pgdat;
n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
- if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS *
- sizeof(n->vm_node_stat_diff[0])))
- return true;
+ if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
+ return true;
}
return false;
}
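The need_update() cleanup above relies on sizeof() of the embedded diff array already equalling the item count times the element size, so the memchr_inv() scans cover exactly the same bytes as the old explicit multiplication. A tiny compile-time check of that equivalence on a mocked-up structure; the item count and element type here are invented for the sketch, not taken from the kernel headers:

#include <assert.h>

#define NR_FAKE_STAT_ITEMS 16		/* invented value for the sketch */

struct fake_zonestat {
	signed char vm_stat_diff[NR_FAKE_STAT_ITEMS];
};

/*
 * sizeof on the embedded array covers every element, so passing
 * sizeof(p->vm_stat_diff) to a memchr_inv()-style scan is equivalent to the
 * old NR_..._ITEMS * sizeof(p->vm_stat_diff[0]) form.
 */
static_assert(sizeof(((struct fake_zonestat *)0)->vm_stat_diff) ==
	      NR_FAKE_STAT_ITEMS *
	      sizeof(((struct fake_zonestat *)0)->vm_stat_diff[0]),
	      "array sizeof covers all items");

int main(void)
{
	return 0;
}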
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 270a7df898e2..1209e1786af7 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -931,7 +931,7 @@ static void check_section(const char *modname, struct elf_info *elf,
".kprobes.text", ".cpuidle.text", ".noinstr.text"
#define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \
".fixup", ".entry.text", ".exception.text", ".text.*", \
- ".coldtext"
+ ".coldtext", ".softirqentry.text"
#define INIT_SECTIONS ".init.*"
#define MEM_INIT_SECTIONS ".meminit.*"
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 98d985895ec8..31af29f669d2 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -897,6 +897,9 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
struct tomoyo_page_dump *dump)
{
struct page *page;
+#ifdef CONFIG_MMU
+ int ret;
+#endif
/* dump->data is released by tomoyo_find_next_domain(). */
if (!dump->data) {
@@ -909,11 +912,13 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
/*
* This is called at execve() time in order to dig around
* in the argv/environment of the new process
- * (represented by bprm). 'current' is the process doing
- * the execve().
+ * (represented by bprm).
*/
- if (get_user_pages_remote(bprm->mm, pos, 1,
- FOLL_FORCE, &page, NULL, NULL) <= 0)
+ mmap_read_lock(bprm->mm);
+ ret = get_user_pages_remote(bprm->mm, pos, 1,
+ FOLL_FORCE, &page, NULL, NULL);
+ mmap_read_unlock(bprm->mm);
+ if (ret <= 0)
return false;
#else
page = bprm->page[pos / PAGE_SIZE];
diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h
index f9a12005fcea..16ec895bbe5f 100644
--- a/tools/testing/scatterlist/linux/mm.h
+++ b/tools/testing/scatterlist/linux/mm.h
@@ -127,7 +127,6 @@ kmalloc_array(unsigned int n, unsigned int size, unsigned int flags)
#define kmemleak_free(a)
#define PageSlab(p) (0)
-#define flush_kernel_dcache_page(p)
#define MAX_ERRNO 4095
diff --git a/tools/testing/selftests/powerpc/primitives/asm/extable.h b/tools/testing/selftests/powerpc/primitives/asm/extable.h
new file mode 120000
index 000000000000..6385f059a951
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/asm/extable.h
@@ -0,0 +1 @@
+../../../../../../arch/powerpc/include/asm/extable.h \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c
index 82f7bdc2e5e6..67ca297c5cca 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c
@@ -57,7 +57,7 @@ trans:
: [gpr_1]"i"(GPR_1), [gpr_2]"i"(GPR_2),
[sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "b" (&a),
[flt_2] "b" (&b), [cptr1] "b" (&cptr[1])
- : "memory", "r7", "r8", "r9", "r10",
+ : "memory", "r0", "r7", "r8", "r9", "r10",
"r11", "r12", "r13", "r14", "r15", "r16",
"r17", "r18", "r19", "r20", "r21", "r22",
"r23", "r24", "r25", "r26", "r27", "r28",
@@ -113,6 +113,7 @@ int ptrace_tm_gpr(void)
int ret, status;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT);
pid = fork();
if (pid < 0) {
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c
index ad65be6e8e85..6f2bce1b6c5d 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c
@@ -65,7 +65,7 @@ trans:
: [gpr_1]"i"(GPR_1), [gpr_2]"i"(GPR_2), [gpr_4]"i"(GPR_4),
[sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "b" (&a),
[flt_4] "b" (&d)
- : "memory", "r5", "r6", "r7",
+ : "memory", "r0", "r5", "r6", "r7",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"
@@ -119,6 +119,7 @@ int ptrace_tm_spd_gpr(void)
int ret, status;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT);
pid = fork();
if (pid < 0) {
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c
index 2ecfa1158e2b..e112a34fbe59 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c
@@ -129,6 +129,7 @@ int ptrace_tm_spd_tar(void)
int ret, status;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT);
pid = fork();
if (pid == 0)
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c
index 6f7fb51f0809..40133d49fe39 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c
@@ -129,6 +129,7 @@ int ptrace_tm_spd_vsx(void)
int ret, status, i;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT);
for (i = 0; i < 128; i++) {
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c
index 068bfed2e606..880ba6a29a48 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c
@@ -114,6 +114,7 @@ int ptrace_tm_spr(void)
int ret, status;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
shm_id = shmget(IPC_PRIVATE, sizeof(struct shared), 0777|IPC_CREAT);
shm_id1 = shmget(IPC_PRIVATE, sizeof(int), 0777|IPC_CREAT);
pid = fork();
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c
index 46ef378a15ec..d0db6df0f0ea 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c
@@ -117,6 +117,7 @@ int ptrace_tm_tar(void)
int ret, status;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT);
pid = fork();
if (pid == 0)
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c
index 70ca01234f79..4f05ce4fd282 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c
@@ -113,6 +113,7 @@ int ptrace_tm_vsx(void)
int ret, status, i;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT);
for (i = 0; i < 128; i++) {
diff --git a/tools/testing/selftests/powerpc/signal/signal_tm.c b/tools/testing/selftests/powerpc/signal/signal_tm.c
index 5bf2224ef7f2..c9cf66a3daa2 100644
--- a/tools/testing/selftests/powerpc/signal/signal_tm.c
+++ b/tools/testing/selftests/powerpc/signal/signal_tm.c
@@ -56,6 +56,7 @@ static int test_signal_tm()
}
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
for (i = 0; i < MAX_ATTEMPT; i++) {
/*
diff --git a/tools/testing/selftests/powerpc/tm/tm-exec.c b/tools/testing/selftests/powerpc/tm/tm-exec.c
index 260cfdb97d23..c59919d6710d 100644
--- a/tools/testing/selftests/powerpc/tm/tm-exec.c
+++ b/tools/testing/selftests/powerpc/tm/tm-exec.c
@@ -27,6 +27,7 @@ static char *path;
static int test_exec(void)
{
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
asm __volatile__(
"tbegin.;"
diff --git a/tools/testing/selftests/powerpc/tm/tm-fork.c b/tools/testing/selftests/powerpc/tm/tm-fork.c
index 6efa5a685a77..c27b935f0e9f 100644
--- a/tools/testing/selftests/powerpc/tm/tm-fork.c
+++ b/tools/testing/selftests/powerpc/tm/tm-fork.c
@@ -21,6 +21,7 @@
int test_fork(void)
{
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
asm __volatile__(
"tbegin.;"
diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c b/tools/testing/selftests/powerpc/tm/tm-poison.c
index 29e5f26af7b9..a7bbf034b5bb 100644
--- a/tools/testing/selftests/powerpc/tm/tm-poison.c
+++ b/tools/testing/selftests/powerpc/tm/tm-poison.c
@@ -20,7 +20,6 @@
#include <sched.h>
#include <sys/types.h>
#include <signal.h>
-#include <inttypes.h>
#include "tm.h"
@@ -34,6 +33,7 @@ int tm_poison_test(void)
bool fail_vr = false;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
cpu = pick_online_cpu();
FAIL_IF(cpu < 0);
diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
index 4cdb83964bb3..85c940ae6ff8 100644
--- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
@@ -40,6 +40,7 @@ int test_body(void)
uint64_t rv, dscr1 = 1, dscr2, texasr;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
printf("Check DSCR TM context switch: ");
fflush(stdout);
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
index 254f912ad611..657d755b2905 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
@@ -79,6 +79,7 @@ static int tm_signal_context_chk_fpu()
pid_t pid = getpid();
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
act.sa_sigaction = signal_usr1;
sigemptyset(&act.sa_mask);
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
index 0cc680f61828..400fa70ca71e 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
@@ -81,6 +81,7 @@ static int tm_signal_context_chk_gpr()
pid_t pid = getpid();
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
act.sa_sigaction = signal_usr1;
sigemptyset(&act.sa_mask);
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
index b6d52730a0d8..d628fd302b28 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
@@ -104,6 +104,7 @@ static int tm_signal_context_chk()
pid_t pid = getpid();
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
act.sa_sigaction = signal_usr1;
sigemptyset(&act.sa_mask);
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
index 8e25e2072ecd..9bd869245bad 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
@@ -153,6 +153,7 @@ static int tm_signal_context_chk()
pid_t pid = getpid();
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
act.sa_sigaction = signal_usr1;
sigemptyset(&act.sa_mask);
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c
index 5908bc6abe60..0b84c9208d62 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c
@@ -226,6 +226,7 @@ int tm_signal_pagefault(void)
stack_t ss;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
SKIP_IF(!have_userfaultfd());
setup_uf_mem();
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
index 07c388147b75..06b801906f27 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
@@ -32,6 +32,7 @@ int tm_signal_sigreturn_nt(void)
struct sigaction trap_sa;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
trap_sa.sa_flags = SA_SIGINFO;
trap_sa.sa_sigaction = trap_signal_handler;
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
index cdcf8c5bbbc7..68807aac8dd3 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
@@ -35,6 +35,7 @@ int tm_signal_stack()
int pid;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
pid = fork();
if (pid < 0)
diff --git a/tools/testing/selftests/powerpc/tm/tm-sigreturn.c b/tools/testing/selftests/powerpc/tm/tm-sigreturn.c
index 9a6017a1d769..ffe4e5515f33 100644
--- a/tools/testing/selftests/powerpc/tm/tm-sigreturn.c
+++ b/tools/testing/selftests/powerpc/tm/tm-sigreturn.c
@@ -55,6 +55,7 @@ int tm_sigreturn(void)
uint64_t ret = 0;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
SKIP_IF(!is_ppc64le());
memset(&sa, 0, sizeof(sa));
diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall.c b/tools/testing/selftests/powerpc/tm/tm-syscall.c
index becb8207b432..467a6b3134b2 100644
--- a/tools/testing/selftests/powerpc/tm/tm-syscall.c
+++ b/tools/testing/selftests/powerpc/tm/tm-syscall.c
@@ -25,7 +25,6 @@ extern int getppid_tm_suspended(void);
unsigned retries = 0;
#define TEST_DURATION 10 /* seconds */
-#define TM_RETRIES 100
pid_t getppid_tm(bool suspend)
{
@@ -67,6 +66,7 @@ int tm_syscall(void)
struct timeval end, now;
SKIP_IF(!have_htm_nosc());
+ SKIP_IF(htm_is_synthetic());
setbuf(stdout, NULL);
diff --git a/tools/testing/selftests/powerpc/tm/tm-tar.c b/tools/testing/selftests/powerpc/tm/tm-tar.c
index 03be8c47292b..f2a9137f3c1e 100644
--- a/tools/testing/selftests/powerpc/tm/tm-tar.c
+++ b/tools/testing/selftests/powerpc/tm/tm-tar.c
@@ -26,6 +26,7 @@ int test_tar(void)
int i;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
SKIP_IF(!is_ppc64le());
for (i = 0; i < num_loops; i++)
diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
index 794d574db784..dd5ddffa28b7 100644
--- a/tools/testing/selftests/powerpc/tm/tm-tmspr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
@@ -96,6 +96,7 @@ int test_tmspr()
unsigned long i;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
/* To cause some context switching */
thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN);
diff --git a/tools/testing/selftests/powerpc/tm/tm-trap.c b/tools/testing/selftests/powerpc/tm/tm-trap.c
index 11521077f915..97cb74768e30 100644
--- a/tools/testing/selftests/powerpc/tm/tm-trap.c
+++ b/tools/testing/selftests/powerpc/tm/tm-trap.c
@@ -255,6 +255,7 @@ int tm_trap_test(void)
struct sigaction trap_sa;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
trap_sa.sa_flags = SA_SIGINFO;
trap_sa.sa_sigaction = trap_signal_handler;
diff --git a/tools/testing/selftests/powerpc/tm/tm-unavailable.c b/tools/testing/selftests/powerpc/tm/tm-unavailable.c
index a1348a5f721a..6bf1b65b020d 100644
--- a/tools/testing/selftests/powerpc/tm/tm-unavailable.c
+++ b/tools/testing/selftests/powerpc/tm/tm-unavailable.c
@@ -344,6 +344,7 @@ int tm_unavailable_test(void)
cpu_set_t cpuset;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
cpu = pick_online_cpu();
FAIL_IF(cpu < 0);
diff --git a/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c b/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c
index 9ef37a9836ac..34364ed2b6b7 100644
--- a/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c
+++ b/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c
@@ -91,6 +91,7 @@ int tm_vmx_unavail_test()
pthread_t *thread;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
passed = 1;
diff --git a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c
index c1e788a6df47..1640e7ead69b 100644
--- a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c
+++ b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c
@@ -46,6 +46,7 @@ int test_vmxcopy()
uint64_t aborted = 0;
SKIP_IF(!have_htm());
+ SKIP_IF(htm_is_synthetic());
SKIP_IF(!is_ppc64le());
fd = mkstemp(tmpfile);
diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h
index c5a1e5c163fc..c03c6e778876 100644
--- a/tools/testing/selftests/powerpc/tm/tm.h
+++ b/tools/testing/selftests/powerpc/tm/tm.h
@@ -10,6 +10,9 @@
#include <asm/tm.h>
#include "utils.h"
+#include "reg.h"
+
+#define TM_RETRIES 100
static inline bool have_htm(void)
{
@@ -31,6 +34,39 @@ static inline bool have_htm_nosc(void)
#endif
}
+/*
+ * Transactional Memory was removed in ISA 3.1. A synthetic TM implementation
+ * is provided on P10 for threads running in P8/P9 compatibility mode. The
+ * synthetic implementation immediately fails after tbegin. This failure sets
+ * Bit 7 (Failure Persistent) and Bit 15 (Implementation-specific).
+ */
+static inline bool htm_is_synthetic(void)
+{
+ int i;
+
+ /*
+ * Per the ISA, the Failure Persistent bit may be incorrect. Try a few
+ * times in case we got an Implementation-specific failure on a non ISA
+ * v3.1 system. On these systems the Implementation-specific failure
+ * should not be persistent.
+ */
+ for (i = 0; i < TM_RETRIES; i++) {
+ asm volatile(
+ "tbegin.;"
+ "beq 1f;"
+ "tend.;"
+ "1:"
+ :
+ :
+ : "memory");
+
+ if ((__builtin_get_texasr() & (TEXASR_FP | TEXASR_IC)) !=
+ (TEXASR_FP | TEXASR_IC))
+ break;
+ }
+ return i == TM_RETRIES;
+}
+
static inline long failure_code(void)
{
return __builtin_get_texasru() >> 24;
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index f0fd80ef17df..b02eac613fdd 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -27,3 +27,4 @@ hmm-tests
memfd_secret
local_config.*
split_huge_page_test
+ksm_tests
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 521243770f26..d9605bd10f2d 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -45,6 +45,7 @@ TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += userfaultfd
TEST_GEN_FILES += split_huge_page_test
+TEST_GEN_FILES += ksm_tests
ifeq ($(MACHINE),x86_64)
CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
@@ -145,6 +146,8 @@ $(OUTPUT)/hmm-tests: local_config.h
# HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
$(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS)
+$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
+
local_config.mk local_config.h: check_config.sh
/bin/sh ./check_config.sh $(CC)
diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
index 18d33684faad..fe8fcfb334e0 100644
--- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
@@ -1,11 +1,14 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
set -e
if [[ $(id -u) -ne 0 ]]; then
echo "This test must be run as root. Skipping..."
- exit 0
+ exit $ksft_skip
fi
fault_limit_file=limit_in_bytes
diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
index d11d1febccc3..4a9a3afe9fd4 100644
--- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
@@ -1,11 +1,14 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
set -e
if [[ $(id -u) -ne 0 ]]; then
echo "This test must be run as root. Skipping..."
- exit 0
+ exit $ksft_skip
fi
usage_file=usage_in_bytes
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c
new file mode 100644
index 000000000000..b61dcdb44c5b
--- /dev/null
+++ b/tools/testing/selftests/vm/ksm_tests.c
@@ -0,0 +1,662 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <time.h>
+#include <string.h>
+#include <numa.h>
+
+#include "../kselftest.h"
+#include "../../../../include/vdso/time64.h"
+
+#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
+#define KSM_FP(s) (KSM_SYSFS_PATH s)
+#define KSM_SCAN_LIMIT_SEC_DEFAULT 120
+#define KSM_PAGE_COUNT_DEFAULT 10l
+#define KSM_PROT_STR_DEFAULT "rw"
+#define KSM_USE_ZERO_PAGES_DEFAULT false
+#define KSM_MERGE_ACROSS_NODES_DEFAULT true
+#define MB (1ul << 20)
+
+struct ksm_sysfs {
+ unsigned long max_page_sharing;
+ unsigned long merge_across_nodes;
+ unsigned long pages_to_scan;
+ unsigned long run;
+ unsigned long sleep_millisecs;
+ unsigned long stable_node_chains_prune_millisecs;
+ unsigned long use_zero_pages;
+};
+
+enum ksm_test_name {
+ CHECK_KSM_MERGE,
+ CHECK_KSM_UNMERGE,
+ CHECK_KSM_ZERO_PAGE_MERGE,
+ CHECK_KSM_NUMA_MERGE,
+ KSM_MERGE_TIME,
+ KSM_COW_TIME
+};
+
+static int ksm_write_sysfs(const char *file_path, unsigned long val)
+{
+ FILE *f = fopen(file_path, "w");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fprintf(f, "%lu", val) < 0) {
+ perror("fprintf");
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
+
+static int ksm_read_sysfs(const char *file_path, unsigned long *val)
+{
+ FILE *f = fopen(file_path, "r");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fscanf(f, "%lu", val) != 1) {
+ perror("fscanf");
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
+
+static int str_to_prot(char *prot_str)
+{
+ int prot = 0;
+
+ if ((strchr(prot_str, 'r')) != NULL)
+ prot |= PROT_READ;
+ if ((strchr(prot_str, 'w')) != NULL)
+ prot |= PROT_WRITE;
+ if ((strchr(prot_str, 'x')) != NULL)
+ prot |= PROT_EXEC;
+
+ return prot;
+}
+
+static void print_help(void)
+{
+ printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n"
+ "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n");
+
+ printf("Supported <test type>:\n"
+ " -M (page merging)\n"
+ " -Z (zero pages merging)\n"
+ " -N (merging of pages in different NUMA nodes)\n"
+ " -U (page unmerging)\n"
+ " -P evaluate merging time and speed.\n"
+ " For this test, the size of duplicated memory area (in MiB)\n"
+ " must be provided using -s option\n"
+ " -C evaluate the time required to break COW of merged pages.\n\n");
+
+ printf(" -a: specify the access protections of pages.\n"
+ " <prot> must be of the form [rwx].\n"
+ " Default: %s\n", KSM_PROT_STR_DEFAULT);
+ printf(" -p: specify the number of pages to test.\n"
+ " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT);
+ printf(" -l: limit the maximum running time (in seconds) for a test.\n"
+ " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT);
+ printf(" -z: change use_zero_pages tunable\n"
+ " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
+ printf(" -m: change merge_across_nodes tunable\n"
+ " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
+ printf(" -s: the size of duplicated memory area (in MiB)\n");
+
+ exit(0);
+}
+
+static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size)
+{
+ void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0);
+
+ if (!map_ptr) {
+ perror("mmap");
+ return NULL;
+ }
+ memset(map_ptr, data, map_size);
+ if (mprotect(map_ptr, map_size, prot)) {
+ perror("mprotect");
+ munmap(map_ptr, map_size);
+ return NULL;
+ }
+
+ return map_ptr;
+}
+
+static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
+{
+ struct timespec cur_time;
+ unsigned long cur_scan, init_scan;
+
+ if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan))
+ return 1;
+ cur_scan = init_scan;
+
+ while (cur_scan < init_scan + scan_count) {
+ if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan))
+ return 1;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) {
+ perror("clock_gettime");
+ return 1;
+ }
+ if ((cur_time.tv_sec - start_time.tv_sec) > timeout) {
+ printf("Scan time limit exceeded\n");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout)
+{
+ if (madvise(addr, size, MADV_MERGEABLE)) {
+ perror("madvise");
+ return 1;
+ }
+ if (ksm_write_sysfs(KSM_FP("run"), 1))
+ return 1;
+
+ /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */
+ if (ksm_do_scan(2, start_time, timeout))
+ return 1;
+
+ return 0;
+}
+
+static bool assert_ksm_pages_count(long dupl_page_count)
+{
+ unsigned long max_page_sharing, pages_sharing, pages_shared;
+
+ if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
+ ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
+ ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
+ return false;
+
+ /*
+ * Since there must be at least 2 pages for merging and 1 page can be
+ * shared with the limited number of pages (max_page_sharing), sometimes
+ * there are 'leftover' pages that cannot be merged. For example, if there
+ * are 11 pages and max_page_sharing = 10, then only 10 pages will be
+ * merged and the 11th page won't be affected. As a result, when
+ * dupl_page_count % max_page_sharing == 1, pages_shared and pages_sharing
+ * end up the same as they would for dupl_page_count - 1.
+ */
+ if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) {
+ if (pages_shared == dupl_page_count / max_page_sharing &&
+ pages_sharing == pages_shared * (max_page_sharing - 1))
+ return true;
+ } else {
+ if (pages_shared == (dupl_page_count / max_page_sharing + 1) &&
+ pages_sharing == dupl_page_count - pages_shared)
+ return true;
+ }
+
+ return false;
+}
+
+static int ksm_save_def(struct ksm_sysfs *ksm_sysfs)
+{
+ if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) ||
+ ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) ||
+ ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) ||
+ ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) ||
+ ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) ||
+ ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+ &ksm_sysfs->stable_node_chains_prune_millisecs) ||
+ ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages))
+ return 1;
+
+ return 0;
+}
+
+static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
+{
+ if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) ||
+ ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) ||
+ ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) ||
+ ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) ||
+ ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) ||
+ ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+ ksm_sysfs->stable_node_chains_prune_millisecs) ||
+ ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages))
+ return 1;
+
+ return 0;
+}
+
+static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ /* fill pages with the same data and merge them */
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ /* verify that the right number of pages are merged */
+ if (assert_ksm_pages_count(page_count)) {
+ printf("OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+ }
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time;
+ int page_count = 2;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ /* fill pages with the same data and merge them */
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */
+ memset(map_ptr, '-', 1);
+ memset(map_ptr + page_size, '+', 1);
+
+ /* get at least 1 scan, so KSM can detect that the pages were modified */
+ if (ksm_do_scan(1, start_time, timeout))
+ goto err_out;
+
+ /* check that unmerging was successful and 0 pages are currently merged */
+ if (assert_ksm_pages_count(0)) {
+ printf("OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+ }
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout,
+ bool use_zero_pages, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages))
+ return KSFT_FAIL;
+
+ /* fill pages with zero and try to merge them */
+ map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ /*
+ * verify that the right number of pages are merged:
+ * 1) if use_zero_pages is set to 1, empty pages are merged
+ * with the kernel zero page instead of with each other;
+ * 2) if use_zero_pages is set to 0, empty pages are not treated specially
+ * and merged as usual.
+ */
+ if (use_zero_pages && !assert_ksm_pages_count(0))
+ goto err_out;
+ else if (!use_zero_pages && !assert_ksm_pages_count(page_count))
+ goto err_out;
+
+ printf("OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
+ size_t page_size)
+{
+ void *numa1_map_ptr, *numa2_map_ptr;
+ struct timespec start_time;
+ int page_count = 2;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ if (numa_available() < 0) {
+ perror("NUMA support not enabled");
+ return KSFT_SKIP;
+ }
+ if (numa_max_node() < 1) {
+ printf("At least 2 NUMA nodes must be available\n");
+ return KSFT_SKIP;
+ }
+ if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes))
+ return KSFT_FAIL;
+
+ /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
+ numa1_map_ptr = numa_alloc_onnode(page_size, 0);
+ numa2_map_ptr = numa_alloc_onnode(page_size, 1);
+ if (!numa1_map_ptr || !numa2_map_ptr) {
+ perror("numa_alloc_onnode");
+ return KSFT_FAIL;
+ }
+
+ memset(numa1_map_ptr, '*', page_size);
+ memset(numa2_map_ptr, '*', page_size);
+
+ /* try to merge the pages */
+ if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) ||
+ ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout))
+ goto err_out;
+
+ /*
+ * verify that the right number of pages are merged:
+ * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged;
+ * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is
+ * only 1 unique page in each node and they can't be shared.
+ */
+ if (merge_across_nodes && !assert_ksm_pages_count(page_count))
+ goto err_out;
+ else if (!merge_across_nodes && !assert_ksm_pages_count(0))
+ goto err_out;
+
+ numa_free(numa1_map_ptr, page_size);
+ numa_free(numa2_map_ptr, page_size);
+ printf("OK\n");
+ return KSFT_PASS;
+
+err_out:
+ numa_free(numa1_map_ptr, page_size);
+ numa_free(numa2_map_ptr, page_size);
+ printf("Not OK\n");
+ return KSFT_FAIL;
+}
+
+static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
+{
+ void *map_ptr;
+ struct timespec start_time, end_time;
+ unsigned long scan_time_ns;
+
+ map_size *= MB;
+
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+ goto err_out;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+
+ scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Total size: %lu MiB\n", map_size / MB);
+ printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+ scan_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n", (map_size / MB) /
+ ((double)scan_time_ns / NSEC_PER_SEC));
+
+ munmap(map_ptr, map_size);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, map_size);
+ return KSFT_FAIL;
+}
+
+static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time, end_time;
+ unsigned long cow_time_ns;
+
+ /* page_count must be less than 2*page_size */
+ size_t page_count = 4000;
+
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+ for (size_t i = 0; i < page_count - 1; i = i + 2)
+ memset(map_ptr + page_size * i, '-', 1);
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB);
+ printf("Not merged pages:\n");
+ printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+ cow_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) /
+ ((double)cow_time_ns / NSEC_PER_SEC));
+
+ /* Create 2000 pairs of duplicate pages */
+ for (size_t i = 0; i < page_count - 1; i = i + 2) {
+ memset(map_ptr + page_size * i, '+', i / 2 + 1);
+ memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1);
+ }
+ if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ for (size_t i = 0; i < page_count - 1; i = i + 2)
+ memset(map_ptr + page_size * i, '-', 1);
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+
+ cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Merged pages:\n");
+ printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+ cow_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) /
+ ((double)cow_time_ns / NSEC_PER_SEC));
+
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret, opt;
+ int prot = 0;
+ int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
+ long page_count = KSM_PAGE_COUNT_DEFAULT;
+ size_t page_size = sysconf(_SC_PAGESIZE);
+ struct ksm_sysfs ksm_sysfs_old;
+ int test_name = CHECK_KSM_MERGE;
+ bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT;
+ bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
+ long size_MB = 0;
+
+ while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPC")) != -1) {
+ switch (opt) {
+ case 'a':
+ prot = str_to_prot(optarg);
+ break;
+ case 'p':
+ page_count = atol(optarg);
+ if (page_count <= 0) {
+ printf("The number of pages must be greater than 0\n");
+ return KSFT_FAIL;
+ }
+ break;
+ case 'l':
+ ksm_scan_limit_sec = atoi(optarg);
+ if (ksm_scan_limit_sec <= 0) {
+ printf("Timeout value must be greater than 0\n");
+ return KSFT_FAIL;
+ }
+ break;
+ case 'h':
+ print_help();
+ break;
+ case 'z':
+ if (strcmp(optarg, "0") == 0)
+ use_zero_pages = 0;
+ else
+ use_zero_pages = 1;
+ break;
+ case 'm':
+ if (strcmp(optarg, "0") == 0)
+ merge_across_nodes = 0;
+ else
+ merge_across_nodes = 1;
+ break;
+ case 's':
+ size_MB = atoi(optarg);
+ if (size_MB <= 0) {
+ printf("Size must be greater than 0\n");
+ return KSFT_FAIL;
+ }
+ case 'M':
+ break;
+ case 'U':
+ test_name = CHECK_KSM_UNMERGE;
+ break;
+ case 'Z':
+ test_name = CHECK_KSM_ZERO_PAGE_MERGE;
+ break;
+ case 'N':
+ test_name = CHECK_KSM_NUMA_MERGE;
+ break;
+ case 'P':
+ test_name = KSM_MERGE_TIME;
+ break;
+ case 'C':
+ test_name = KSM_COW_TIME;
+ break;
+ default:
+ return KSFT_FAIL;
+ }
+ }
+
+ if (prot == 0)
+ prot = str_to_prot(KSM_PROT_STR_DEFAULT);
+
+ if (access(KSM_SYSFS_PATH, F_OK)) {
+ printf("Config KSM not enabled\n");
+ return KSFT_SKIP;
+ }
+
+ if (ksm_save_def(&ksm_sysfs_old)) {
+ printf("Cannot save default tunables\n");
+ return KSFT_FAIL;
+ }
+
+ if (ksm_write_sysfs(KSM_FP("run"), 2) ||
+ ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) ||
+ ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) ||
+ ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count))
+ return KSFT_FAIL;
+
+ switch (test_name) {
+ case CHECK_KSM_MERGE:
+ ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+ ksm_scan_limit_sec, page_size);
+ break;
+ case CHECK_KSM_UNMERGE:
+ ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+ page_size);
+ break;
+ case CHECK_KSM_ZERO_PAGE_MERGE:
+ ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+ ksm_scan_limit_sec, use_zero_pages, page_size);
+ break;
+ case CHECK_KSM_NUMA_MERGE:
+ ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+ merge_across_nodes, page_size);
+ break;
+ case KSM_MERGE_TIME:
+ if (size_MB == 0) {
+ printf("Option '-s' is required.\n");
+ return KSFT_FAIL;
+ }
+ ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+ size_MB);
+ break;
+ case KSM_COW_TIME:
+ ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+ page_size);
+ break;
+ }
+
+ if (ksm_restore(&ksm_sysfs_old)) {
+ printf("Cannot restore default tunables\n");
+ return KSFT_FAIL;
+ }
+
+ return ret;
+}
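The pages_shared/pages_sharing bookkeeping that assert_ksm_pages_count() above verifies is easiest to see with concrete numbers. A standalone calculator mirroring the two branches of that check; it does pure arithmetic on arbitrary example inputs and never touches KSM:

#include <stdio.h>

/* Same expected-count arithmetic as assert_ksm_pages_count() in ksm_tests.c. */
static void expected_counts(long dupl, long max_page_sharing)
{
	long shared, sharing;

	if (dupl % max_page_sharing == 1 || dupl % max_page_sharing == 0) {
		shared = dupl / max_page_sharing;
		sharing = shared * (max_page_sharing - 1);
	} else {
		shared = dupl / max_page_sharing + 1;
		sharing = dupl - shared;
	}
	printf("%2ld duplicates, max_page_sharing=%ld -> pages_shared=%ld, pages_sharing=%ld\n",
	       dupl, max_page_sharing, shared, sharing);
}

int main(void)
{
	/* 10 and 11 duplicates give identical counts, as the comment explains. */
	expected_counts(10, 10);
	expected_counts(11, 10);
	expected_counts(15, 10);
	return 0;
}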
diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c
index ff4d72eb74b9..782ea94dee2f 100644
--- a/tools/testing/selftests/vm/mlock-random-test.c
+++ b/tools/testing/selftests/vm/mlock-random-test.c
@@ -70,7 +70,7 @@ int get_proc_locked_vm_size(void)
}
}
- perror("cann't parse VmLck in /proc/self/status\n");
+ perror("cannot parse VmLck in /proc/self/status\n");
fclose(f);
return -1;
}
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index d09a6b71f1e9..45e803af7c77 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -377,6 +377,102 @@ else
exitcode=1
fi
+echo "-------------------------------------------------------"
+echo "running KSM MADV_MERGEABLE test with 10 identical pages"
+echo "-------------------------------------------------------"
+./ksm_tests -M -p 10
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "------------------------"
+echo "running KSM unmerge test"
+echo "------------------------"
+./ksm_tests -U
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "----------------------------------------------------------"
+echo "running KSM test with 10 zero pages and use_zero_pages = 0"
+echo "----------------------------------------------------------"
+./ksm_tests -Z -p 10 -z 0
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "----------------------------------------------------------"
+echo "running KSM test with 10 zero pages and use_zero_pages = 1"
+echo "----------------------------------------------------------"
+./ksm_tests -Z -p 10 -z 1
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "-------------------------------------------------------------"
+echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 1"
+echo "-------------------------------------------------------------"
+./ksm_tests -N -m 1
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "-------------------------------------------------------------"
+echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 0"
+echo "-------------------------------------------------------------"
+./ksm_tests -N -m 0
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
exit $exitcode
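run_vmtests.sh only drives the -M/-U/-Z/-N modes of the new ksm_tests binary; the -P and -C timing modes described in print_help() have to be run by hand. A hypothetical manual invocation, assuming the binary has been built under tools/testing/selftests/vm and that root is needed to write the KSM sysfs tunables:

# Merge-time benchmark over a 100 MiB duplicated area; -s is mandatory with -P,
# and -l raises the scan time limit to 300 seconds.
sudo ./ksm_tests -P -s 100 -l 300

# COW-break timing for unmerged vs. merged pages.
sudo ./ksm_tests -C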
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 2ea438e6b8b1..10ab56c2484a 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -566,6 +566,18 @@ static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
}
}
+static void wake_range(int ufd, unsigned long addr, unsigned long len)
+{
+ struct uffdio_range uffdio_wake;
+
+ uffdio_wake.start = addr;
+ uffdio_wake.len = len;
+
+ if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
+ fprintf(stderr, "error waking %lu\n",
+ addr), exit(1);
+}
+
static int __copy_page(int ufd, unsigned long offset, bool retry)
{
struct uffdio_copy uffdio_copy;
@@ -585,6 +597,7 @@ static int __copy_page(int ufd, unsigned long offset, bool retry)
if (uffdio_copy.copy != -EEXIST)
err("UFFDIO_COPY error: %"PRId64,
(int64_t)uffdio_copy.copy);
+ wake_range(ufd, uffdio_copy.dst, page_size);
} else if (uffdio_copy.copy != page_size) {
err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
} else {