diff options
Diffstat (limited to 'arch/arm64')
25 files changed, 496 insertions, 233 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0f8b2e35ba99..2272a9505727 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -331,16 +331,16 @@ config BROKEN_GAS_INST config KASAN_SHADOW_OFFSET hex depends on KASAN - default 0xdfffa00000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS - default 0xdfffd00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS - default 0xdffffe8000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS - default 0xdfffffd000000000 if ARM64_VA_BITS_39 && !KASAN_SW_TAGS - default 0xdffffffa00000000 if ARM64_VA_BITS_36 && !KASAN_SW_TAGS - default 0xefff900000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && KASAN_SW_TAGS - default 0xefffc80000000000 if ARM64_VA_BITS_47 && KASAN_SW_TAGS - default 0xeffffe4000000000 if ARM64_VA_BITS_42 && KASAN_SW_TAGS - default 0xefffffc800000000 if ARM64_VA_BITS_39 && KASAN_SW_TAGS - default 0xeffffff900000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS + default 0xdfff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS + default 0xdfffc00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS + default 0xdffffe0000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS + default 0xdfffffc000000000 if ARM64_VA_BITS_39 && !KASAN_SW_TAGS + default 0xdffffff800000000 if ARM64_VA_BITS_36 && !KASAN_SW_TAGS + default 0xefff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && KASAN_SW_TAGS + default 0xefffc00000000000 if ARM64_VA_BITS_47 && KASAN_SW_TAGS + default 0xeffffe0000000000 if ARM64_VA_BITS_42 && KASAN_SW_TAGS + default 0xefffffc000000000 if ARM64_VA_BITS_39 && KASAN_SW_TAGS + default 0xeffffff800000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS default 0xffffffffffffffff source "arch/arm64/Kconfig.platforms" @@ -1849,15 +1849,36 @@ config CMDLINE entering them here. As a minimum, you should specify the the root device (e.g. root=/dev/nfs). +choice + prompt "Kernel command line type" if CMDLINE != "" + default CMDLINE_FROM_BOOTLOADER + help + Choose how the kernel will handle the provided default kernel + command line string. + +config CMDLINE_FROM_BOOTLOADER + bool "Use bootloader kernel arguments if available" + help + Uses the command-line options passed by the boot loader. If + the boot loader doesn't provide any, the default kernel command + string provided in CMDLINE will be used. + +config CMDLINE_EXTEND + bool "Extend bootloader kernel arguments" + help + The command-line arguments provided by the boot loader will be + appended to the default kernel command string. + config CMDLINE_FORCE bool "Always use the default kernel command string" - depends on CMDLINE != "" help Always use the default kernel command string, even if the boot loader passes other arguments to the kernel. This is useful if you cannot or don't want to change the command-line options your boot loader passes to the kernel. +endchoice + config EFI_STUB bool diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 97244d4feca9..f5b44ac354dc 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -765,8 +765,16 @@ static inline bool cpu_has_hw_af(void) #ifdef CONFIG_ARM64_AMU_EXTN /* Check whether the cpu supports the Activity Monitors Unit (AMU) */ extern bool cpu_has_amu_feat(int cpu); +#else +static inline bool cpu_has_amu_feat(int cpu) +{ + return false; +} #endif +/* Get a cpu that supports the Activity Monitors Unit (AMU) */ +extern int get_cpu_with_amu_feat(void); + static inline unsigned int get_vmid_bits(u64 mmfr1) { int vmid_bits; diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 99b9383cd036..2a8aa1884d8a 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr) } asmlinkage void enter_from_user_mode(void); -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); void do_bti(struct pt_regs *regs); asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr); diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index cd61239bae8c..556cb2d62b5b 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -30,8 +30,8 @@ * keep a constant PAGE_OFFSET and "fallback" to using the higher end * of the VMEMMAP where 52-bit support is not available in hardware. */ -#define VMEMMAP_SIZE ((_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) \ - >> (PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT)) +#define VMEMMAP_SHIFT (PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT) +#define VMEMMAP_SIZE ((_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) >> VMEMMAP_SHIFT) /* * PAGE_OFFSET - the virtual address of the start of the linear map, at the @@ -44,17 +44,17 @@ #define _PAGE_OFFSET(va) (-(UL(1) << (va))) #define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS)) #define KIMAGE_VADDR (MODULES_END) -#define BPF_JIT_REGION_START (KASAN_SHADOW_END) +#define BPF_JIT_REGION_START (_PAGE_END(VA_BITS_MIN)) #define BPF_JIT_REGION_SIZE (SZ_128M) #define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (BPF_JIT_REGION_END) #define MODULES_VSIZE (SZ_128M) -#define VMEMMAP_START (-VMEMMAP_SIZE - SZ_2M) +#define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT))) #define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) -#define PCI_IO_END (VMEMMAP_START - SZ_2M) +#define PCI_IO_END (VMEMMAP_START - SZ_8M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) -#define FIXADDR_TOP (PCI_IO_START - SZ_2M) +#define FIXADDR_TOP (VMEMMAP_START - SZ_32M) #if VA_BITS > 48 #define VA_BITS_MIN (48) @@ -76,10 +76,11 @@ #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) #define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \ + KASAN_SHADOW_OFFSET) +#define PAGE_END (KASAN_SHADOW_END - (1UL << (vabits_actual - KASAN_SHADOW_SCALE_SHIFT))) #define KASAN_THREAD_SHIFT 1 #else #define KASAN_THREAD_SHIFT 0 -#define KASAN_SHADOW_END (_PAGE_END(VA_BITS_MIN)) +#define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) @@ -167,7 +168,6 @@ #include <asm/bug.h> extern u64 vabits_actual; -#define PAGE_END (_PAGE_END(vabits_actual)) extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ @@ -238,11 +238,9 @@ static inline const void *__tag_set(const void *addr, u8 tag) /* - * The linear kernel range starts at the bottom of the virtual address - * space. Testing the top bit for the start of the region is a - * sufficient check and avoids having to worry about the tag. + * The linear kernel range starts at the bottom of the virtual address space. */ -#define __is_lm_address(addr) (!(((u64)addr) & BIT(vabits_actual - 1))) +#define __is_lm_address(addr) (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET)) #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET) #define __kimg_to_phys(addr) ((addr) - kimage_voffset) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4ff12a7adcfd..ec307b8bcb15 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -22,7 +22,7 @@ * and fixed mappings */ #define VMALLOC_START (MODULES_END) -#define VMALLOC_END (- PUD_SIZE - VMEMMAP_SIZE - SZ_64K) +#define VMALLOC_END (VMEMMAP_START - SZ_256M) #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) diff --git a/arch/arm64/include/asm/signal.h b/arch/arm64/include/asm/signal.h new file mode 100644 index 000000000000..ef449f5f4ba8 --- /dev/null +++ b/arch/arm64/include/asm/signal.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARM64_ASM_SIGNAL_H +#define __ARM64_ASM_SIGNAL_H + +#include <asm/memory.h> +#include <uapi/asm/signal.h> +#include <uapi/asm/siginfo.h> + +static inline void __user *arch_untagged_si_addr(void __user *addr, + unsigned long sig, + unsigned long si_code) +{ + /* + * For historical reasons, all bits of the fault address are exposed as + * address bits for watchpoint exceptions. New architectures should + * handle the tag bits consistently. + */ + if (sig == SIGTRAP && si_code == TRAP_BRKPT) + return addr; + + return untagged_addr(addr); +} +#define arch_untagged_si_addr arch_untagged_si_addr + +#endif diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h index 1ab63cfbbaf1..673be2d1263c 100644 --- a/arch/arm64/include/asm/system_misc.h +++ b/arch/arm64/include/asm/system_misc.h @@ -22,7 +22,7 @@ void die(const char *msg, struct pt_regs *regs, int err); struct siginfo; void arm64_notify_die(const char *str, struct pt_regs *regs, - int signo, int sicode, void __user *addr, + int signo, int sicode, unsigned long far, int err); void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int, diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 11a465243f66..3b8dca4eb08d 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -16,12 +16,14 @@ int pcibus_to_node(struct pci_bus *bus); #include <linux/arch_topology.h> +void update_freq_counters_refs(void); +void topology_scale_freq_tick(void); + #ifdef CONFIG_ARM64_AMU_EXTN /* * Replace task scheduler's default counter-based * frequency-invariance scale factor setting. */ -void topology_scale_freq_tick(void); #define arch_scale_freq_tick topology_scale_freq_tick #endif /* CONFIG_ARM64_AMU_EXTN */ diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index d96dc2c7c09d..54f32a0675df 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -26,9 +26,9 @@ void register_undef_hook(struct undef_hook *hook); void unregister_undef_hook(struct undef_hook *hook); void force_signal_inject(int signal, int code, unsigned long address, unsigned int err); void arm64_notify_segfault(unsigned long addr); -void arm64_force_sig_fault(int signo, int code, void __user *addr, const char *str); -void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, const char *str); -void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, const char *str); +void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str); +void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str); +void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str); /* * Move regs->pc to next instruction and do necessary setup before it diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b7b6804cb931..bffcd55668c7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1526,8 +1526,10 @@ bool cpu_has_amu_feat(int cpu) return cpumask_test_cpu(cpu, &amu_cpus); } -/* Initialize the use of AMU counters for frequency invariance */ -extern void init_cpu_freq_invariance_counters(void); +int get_cpu_with_amu_feat(void) +{ + return cpumask_any(&amu_cpus); +} static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) { @@ -1535,7 +1537,7 @@ static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) pr_info("detected CPU%d: Activity Monitors Unit (AMU)\n", smp_processor_id()); cpumask_set_cpu(smp_processor_id(), &amu_cpus); - init_cpu_freq_invariance_counters(); + update_freq_counters_refs(); } } @@ -1557,6 +1559,11 @@ static bool has_amu(const struct arm64_cpu_capabilities *cap, return true; } +#else +int get_cpu_with_amu_feat(void) +{ + return nr_cpu_ids; +} #endif #ifdef CONFIG_ARM64_VHE diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index fa76151de6ff..4f3661eeb7ec 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -234,9 +234,8 @@ static void send_user_sigtrap(int si_code) if (interrupts_enabled(regs)) local_irq_enable(); - arm64_force_sig_fault(SIGTRAP, si_code, - (void __user *)instruction_pointer(regs), - "User debug trap"); + arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), + "User debug trap"); } static int single_step_handler(unsigned long unused, unsigned int esr, diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index a71844fb923e..28d8a5dca5f1 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -7,30 +7,48 @@ #include <linux/pe.h> #include <linux/sizes.h> + .macro efi_signature_nop +#ifdef CONFIG_EFI +.L_head: + /* + * This ccmp instruction has no meaningful effect except that + * its opcode forms the magic "MZ" signature required by UEFI. + */ + ccmp x18, #0, #0xd, pl +#else + /* + * Bootloaders may inspect the opcode at the start of the kernel + * image to decide if the kernel is capable of booting via UEFI. + * So put an ordinary NOP here, not the "MZ.." pseudo-nop above. + */ + nop +#endif + .endm + .macro __EFI_PE_HEADER +#ifdef CONFIG_EFI + .set .Lpe_header_offset, . - .L_head .long PE_MAGIC -coff_header: .short IMAGE_FILE_MACHINE_ARM64 // Machine - .short section_count // NumberOfSections + .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp .long 0 // PointerToSymbolTable .long 0 // NumberOfSymbols - .short section_table - optional_header // SizeOfOptionalHeader + .short .Lsection_table - .Loptional_header // SizeOfOptionalHeader .short IMAGE_FILE_DEBUG_STRIPPED | \ IMAGE_FILE_EXECUTABLE_IMAGE | \ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics -optional_header: +.Loptional_header: .short PE_OPT_MAGIC_PE32PLUS // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion - .long __initdata_begin - efi_header_end // SizeOfCode + .long __initdata_begin - .Lefi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_efi_pe_entry - _head // AddressOfEntryPoint - .long efi_header_end - _head // BaseOfCode + .long __efistub_efi_pe_entry - .L_head // AddressOfEntryPoint + .long .Lefi_header_end - .L_head // BaseOfCode -extra_header_fields: .quad 0 // ImageBase .long SEGMENT_ALIGN // SectionAlignment .long PECOFF_FILE_ALIGNMENT // FileAlignment @@ -42,10 +60,10 @@ extra_header_fields: .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - _head // SizeOfImage + .long _end - .L_head // SizeOfImage // Everything before the kernel image is considered part of the header - .long efi_header_end - _head // SizeOfHeaders + .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem .short 0 // DllCharacteristics @@ -54,7 +72,7 @@ extra_header_fields: .quad 0 // SizeOfHeapReserve .quad 0 // SizeOfHeapCommit .long 0 // LoaderFlags - .long (section_table - .) / 8 // NumberOfRvaAndSizes + .long (.Lsection_table - .) / 8 // NumberOfRvaAndSizes .quad 0 // ExportTable .quad 0 // ImportTable @@ -64,17 +82,17 @@ extra_header_fields: .quad 0 // BaseRelocationTable #ifdef CONFIG_DEBUG_EFI - .long efi_debug_table - _head // DebugTable - .long efi_debug_table_size + .long .Lefi_debug_table - .L_head // DebugTable + .long .Lefi_debug_table_size #endif // Section table -section_table: +.Lsection_table: .ascii ".text\0\0\0" - .long __initdata_begin - efi_header_end // VirtualSize - .long efi_header_end - _head // VirtualAddress - .long __initdata_begin - efi_header_end // SizeOfRawData - .long efi_header_end - _head // PointerToRawData + .long __initdata_begin - .Lefi_header_end // VirtualSize + .long .Lefi_header_end - .L_head // VirtualAddress + .long __initdata_begin - .Lefi_header_end // SizeOfRawData + .long .Lefi_header_end - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -86,9 +104,9 @@ section_table: .ascii ".data\0\0\0" .long __pecoff_data_size // VirtualSize - .long __initdata_begin - _head // VirtualAddress + .long __initdata_begin - .L_head // VirtualAddress .long __pecoff_data_rawsize // SizeOfRawData - .long __initdata_begin - _head // PointerToRawData + .long __initdata_begin - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -98,7 +116,7 @@ section_table: IMAGE_SCN_MEM_READ | \ IMAGE_SCN_MEM_WRITE // Characteristics - .set section_count, (. - section_table) / 40 + .set .Lsection_count, (. - .Lsection_table) / 40 #ifdef CONFIG_DEBUG_EFI /* @@ -114,21 +132,21 @@ section_table: __INITRODATA .align 2 -efi_debug_table: +.Lefi_debug_table: // EFI_IMAGE_DEBUG_DIRECTORY_ENTRY .long 0 // Characteristics .long 0 // TimeDateStamp .short 0 // MajorVersion .short 0 // MinorVersion .long IMAGE_DEBUG_TYPE_CODEVIEW // Type - .long efi_debug_entry_size // SizeOfData + .long .Lefi_debug_entry_size // SizeOfData .long 0 // RVA - .long efi_debug_entry - _head // FileOffset + .long .Lefi_debug_entry - .L_head // FileOffset - .set efi_debug_table_size, . - efi_debug_table + .set .Lefi_debug_table_size, . - .Lefi_debug_table .previous -efi_debug_entry: +.Lefi_debug_entry: // EFI_IMAGE_DEBUG_CODEVIEW_NB10_ENTRY .ascii "NB10" // Signature .long 0 // Unknown @@ -137,16 +155,12 @@ efi_debug_entry: .asciz VMLINUX_PATH - .set efi_debug_entry_size, . - efi_debug_entry + .set .Lefi_debug_entry_size, . - .Lefi_debug_entry #endif - /* - * EFI will load .text onwards at the 4k section alignment - * described in the PE/COFF header. To ensure that instruction - * sequences using an adrp and a :lo12: immediate will function - * correctly at this alignment, we must ensure that .text is - * placed at a 4k boundary in the Image to begin with. - */ .balign SEGMENT_ALIGN -efi_header_end: +.Lefi_header_end: +#else + .set .Lpe_header_offset, 0x0 +#endif .endm diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 43d4c329775f..dbbddfbf4a72 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr) unsigned long far = read_sysreg(far_el1); local_daif_inherit(regs); - far = untagged_addr(far); do_mem_abort(far, esr, regs); } NOKPROBE_SYMBOL(el1_abort); @@ -114,7 +113,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr) user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); - far = untagged_addr(far); do_mem_abort(far, esr, regs); } NOKPROBE_SYMBOL(el0_da); diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index d8d9caf02834..c1f8f2c5be47 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -58,21 +58,11 @@ * in the entry routines. */ __HEAD -_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ -#ifdef CONFIG_EFI - /* - * This add instruction has no meaningful effect except that - * its opcode forms the magic "MZ" signature required by UEFI. - */ - add x13, x18, #0x16 - b primary_entry -#else + efi_signature_nop // special NOP to identity as PE/COFF executable b primary_entry // branch to kernel start, magic - .long 0 // reserved -#endif .quad 0 // Image load offset from start of RAM, little-endian le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian @@ -80,14 +70,9 @@ _head: .quad 0 // reserved .quad 0 // reserved .ascii ARM64_IMAGE_MAGIC // Magic number -#ifdef CONFIG_EFI - .long pe_header - _head // Offset to the PE header. + .long .Lpe_header_offset // Offset to the PE header. -pe_header: __EFI_PE_HEADER -#else - .long 0 // reserved -#endif __INIT diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index b181e0544b79..0921aa1520b0 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -50,10 +50,16 @@ static __init u64 get_kaslr_seed(void *fdt) return ret; } -static __init const u8 *kaslr_get_cmdline(void *fdt) +static __init bool cmdline_contains_nokaslr(const u8 *cmdline) { - static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE; + const u8 *str; + str = strstr(cmdline, "nokaslr"); + return str == cmdline || (str > cmdline && *(str - 1) == ' '); +} + +static __init bool is_kaslr_disabled_cmdline(void *fdt) +{ if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { int node; const u8 *prop; @@ -65,10 +71,17 @@ static __init const u8 *kaslr_get_cmdline(void *fdt) prop = fdt_getprop(fdt, node, "bootargs", NULL); if (!prop) goto out; - return prop; + + if (cmdline_contains_nokaslr(prop)) + return true; + + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) + goto out; + + return false; } out: - return default_cmdline; + return cmdline_contains_nokaslr(CONFIG_CMDLINE); } /* @@ -83,7 +96,6 @@ u64 __init kaslr_early_init(u64 dt_phys) { void *fdt; u64 seed, offset, mask, module_range; - const u8 *cmdline, *str; unsigned long raw; int size; @@ -115,9 +127,7 @@ u64 __init kaslr_early_init(u64 dt_phys) * Check if 'nokaslr' appears on the command line, and * return 0 if that is the case. */ - cmdline = kaslr_get_cmdline(fdt); - str = strstr(cmdline, "nokaslr"); - if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) { + if (is_kaslr_disabled_cmdline(fdt)) { kaslr_status = KASLR_DISABLED_CMDLINE; return 0; } diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index f49b349e16a3..8ac487c84e37 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -192,14 +192,11 @@ static void ptrace_hbptriggered(struct perf_event *bp, break; } } - arm64_force_sig_ptrace_errno_trap(si_errno, - (void __user *)bkpt->trigger, + arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger, desc); } #endif - arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, - (void __user *)(bkpt->trigger), - desc); + arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } /* diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 133257ffd859..fe1cf52f5f80 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -206,7 +206,7 @@ static void __init request_standard_resources(void) unsigned long i = 0; size_t res_size; - kernel_code.start = __pa_symbol(_text); + kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); @@ -283,7 +283,7 @@ u64 cpu_logical_map(int cpu) void __init __no_sanitize_address setup_arch(char **cmdline_p) { - init_mm.start_code = (unsigned long) _text; + init_mm.start_code = (unsigned long) _stext; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; init_mm.brk = (unsigned long) _end; diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 3c18c2454089..265fe3eb1069 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -68,7 +68,7 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags) */ long compat_arm_syscall(struct pt_regs *regs, int scno) { - void __user *addr; + unsigned long addr; switch (scno) { /* @@ -111,8 +111,7 @@ long compat_arm_syscall(struct pt_regs *regs, int scno) break; } - addr = (void __user *)instruction_pointer(regs) - - (compat_thumb_mode(regs) ? 2 : 4); + addr = instruction_pointer(regs) - (compat_thumb_mode(regs) ? 2 : 4); arm64_notify_die("Oops - bad compat syscall(2)", regs, SIGILL, ILL_ILLTRP, addr, scno); diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 543c67cae02f..b8026ec684ba 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -124,6 +124,12 @@ int __init parse_acpi_topology(void) #endif #ifdef CONFIG_ARM64_AMU_EXTN +#define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) +#define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) +#else +#define read_corecnt() (0UL) +#define read_constcnt() (0UL) +#endif #undef pr_fmt #define pr_fmt(fmt) "AMU: " fmt @@ -133,54 +139,58 @@ static DEFINE_PER_CPU(u64, arch_const_cycles_prev); static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; -/* Initialize counter reference per-cpu variables for the current CPU */ -void init_cpu_freq_invariance_counters(void) +void update_freq_counters_refs(void) { - this_cpu_write(arch_core_cycles_prev, - read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0)); - this_cpu_write(arch_const_cycles_prev, - read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0)); + this_cpu_write(arch_core_cycles_prev, read_corecnt()); + this_cpu_write(arch_const_cycles_prev, read_constcnt()); } -static int validate_cpu_freq_invariance_counters(int cpu) +static inline bool freq_counters_valid(int cpu) { - u64 max_freq_hz, ratio; + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) + return false; if (!cpu_has_amu_feat(cpu)) { pr_debug("CPU%d: counters are not supported.\n", cpu); - return -EINVAL; + return false; } if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || !per_cpu(arch_core_cycles_prev, cpu))) { pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); - return -EINVAL; + return false; } - /* Convert maximum frequency from KHz to Hz and validate */ - max_freq_hz = cpufreq_get_hw_max_freq(cpu) * 1000; - if (unlikely(!max_freq_hz)) { - pr_debug("CPU%d: invalid maximum frequency.\n", cpu); + return true; +} + +static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) +{ + u64 ratio; + + if (unlikely(!max_rate || !ref_rate)) { + pr_debug("CPU%d: invalid maximum or reference frequency.\n", + cpu); return -EINVAL; } /* * Pre-compute the fixed ratio between the frequency of the constant - * counter and the maximum frequency of the CPU. + * reference counter and the maximum frequency of the CPU. * - * const_freq - * arch_max_freq_scale = ---------------- * SCHED_CAPACITY_SCALE² - * cpuinfo_max_freq + * ref_rate + * arch_max_freq_scale = ---------- * SCHED_CAPACITY_SCALE² + * max_rate * * We use a factor of 2 * SCHED_CAPACITY_SHIFT -> SCHED_CAPACITY_SCALE² * in order to ensure a good resolution for arch_max_freq_scale for - * very low arch timer frequencies (down to the KHz range which should + * very low reference frequencies (down to the KHz range which should * be unlikely). */ - ratio = (u64)arch_timer_get_rate() << (2 * SCHED_CAPACITY_SHIFT); - ratio = div64_u64(ratio, max_freq_hz); + ratio = ref_rate << (2 * SCHED_CAPACITY_SHIFT); + ratio = div64_u64(ratio, max_rate); if (!ratio) { - WARN_ONCE(1, "System timer frequency too low.\n"); + WARN_ONCE(1, "Reference frequency too low.\n"); return -EINVAL; } @@ -227,8 +237,12 @@ static int __init init_amu_fie(void) } for_each_present_cpu(cpu) { - if (validate_cpu_freq_invariance_counters(cpu)) + if (!freq_counters_valid(cpu) || + freq_inv_set_max_ratio(cpu, + cpufreq_get_hw_max_freq(cpu) * 1000, + arch_timer_get_rate())) continue; + cpumask_set_cpu(cpu, valid_cpus); have_policy |= enable_policy_freq_counters(cpu, valid_cpus); } @@ -280,11 +294,14 @@ void topology_scale_freq_tick(void) if (!cpumask_test_cpu(cpu, amu_fie_cpus)) return; - const_cnt = read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0); - core_cnt = read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0); prev_const_cnt = this_cpu_read(arch_const_cycles_prev); prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + update_freq_counters_refs(); + + const_cnt = this_cpu_read(arch_const_cycles_prev); + core_cnt = this_cpu_read(arch_core_cycles_prev); + if (unlikely(core_cnt <= prev_core_cnt || const_cnt <= prev_const_cnt)) goto store_and_exit; @@ -309,4 +326,71 @@ store_and_exit: this_cpu_write(arch_core_cycles_prev, core_cnt); this_cpu_write(arch_const_cycles_prev, const_cnt); } -#endif /* CONFIG_ARM64_AMU_EXTN */ + +#ifdef CONFIG_ACPI_CPPC_LIB +#include <acpi/cppc_acpi.h> + +static void cpu_read_corecnt(void *val) +{ + *(u64 *)val = read_corecnt(); +} + +static void cpu_read_constcnt(void *val) +{ + *(u64 *)val = read_constcnt(); +} + +static inline +int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) +{ + /* + * Abort call on counterless CPU or when interrupts are + * disabled - can lead to deadlock in smp sync call. + */ + if (!cpu_has_amu_feat(cpu)) + return -EOPNOTSUPP; + + if (WARN_ON_ONCE(irqs_disabled())) + return -EPERM; + + smp_call_function_single(cpu, func, val, 1); + + return 0; +} + +/* + * Refer to drivers/acpi/cppc_acpi.c for the description of the functions + * below. + */ +bool cpc_ffh_supported(void) +{ + return freq_counters_valid(get_cpu_with_amu_feat()); +} + +int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val) +{ + int ret = -EOPNOTSUPP; + + switch ((u64)reg->address) { + case 0x0: + ret = counters_read_on_cpu(cpu, cpu_read_corecnt, val); + break; + case 0x1: + ret = counters_read_on_cpu(cpu, cpu_read_constcnt, val); + break; + } + + if (!ret) { + *val &= GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + *val >>= reg->bit_offset; + } + + return ret; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_ACPI_CPPC_LIB */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8af4e0e85736..f4ddbe9ed3f1 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -170,32 +170,32 @@ static void arm64_show_signal(int signo, const char *str) __show_regs(regs); } -void arm64_force_sig_fault(int signo, int code, void __user *addr, +void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str) { arm64_show_signal(signo, str); if (signo == SIGKILL) force_sig(SIGKILL); else - force_sig_fault(signo, code, addr); + force_sig_fault(signo, code, (void __user *)far); } -void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, +void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { arm64_show_signal(SIGBUS, str); - force_sig_mceerr(code, addr, lsb); + force_sig_mceerr(code, (void __user *)far, lsb); } -void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, +void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str) { arm64_show_signal(SIGTRAP, str); - force_sig_ptrace_errno_trap(errno, addr); + force_sig_ptrace_errno_trap(errno, (void __user *)far); } void arm64_notify_die(const char *str, struct pt_regs *regs, - int signo, int sicode, void __user *addr, + int signo, int sicode, unsigned long far, int err) { if (user_mode(regs)) { @@ -203,7 +203,7 @@ void arm64_notify_die(const char *str, struct pt_regs *regs, current->thread.fault_address = 0; current->thread.fault_code = err; - arm64_force_sig_fault(signo, sicode, addr, str); + arm64_force_sig_fault(signo, sicode, far, str); } else { die(str, regs, err); } @@ -374,7 +374,7 @@ void force_signal_inject(int signal, int code, unsigned long address, unsigned i signal = SIGKILL; } - arm64_notify_die(desc, regs, signal, code, (void __user *)address, err); + arm64_notify_die(desc, regs, signal, code, address, err); } /* @@ -385,7 +385,7 @@ void arm64_notify_segfault(unsigned long addr) int code; mmap_read_lock(current->mm); - if (find_vma(current->mm, addr) == NULL) + if (find_vma(current->mm, untagged_addr(addr)) == NULL) code = SEGV_MAPERR; else code = SEGV_ACCERR; @@ -448,12 +448,13 @@ NOKPROBE_SYMBOL(do_ptrauth_fault); static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) { - unsigned long address; + unsigned long tagged_address, address; int rt = ESR_ELx_SYS64_ISS_RT(esr); int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; int ret = 0; - address = untagged_addr(pt_regs_read_reg(regs, rt)); + tagged_address = pt_regs_read_reg(regs, rt); + address = untagged_addr(tagged_address); switch (crm) { case ESR_ELx_SYS64_ISS_CRM_DC_CVAU: /* DC CVAU, gets promoted */ @@ -480,7 +481,7 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) } if (ret) - arm64_notify_segfault(address); + arm64_notify_segfault(tagged_address); else arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } @@ -772,7 +773,7 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr) */ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) { - void __user *pc = (void __user *)instruction_pointer(regs); + unsigned long pc = instruction_pointer(regs); current->thread.fault_address = 0; current->thread.fault_code = esr; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index d6cdcf4aa6a5..94a08e3e32b1 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -121,7 +121,7 @@ SECTIONS _text = .; HEAD_TEXT } - .text : { /* Real text segment */ + .text : ALIGN(SEGMENT_ALIGN) { /* Real text segment */ _stext = .; /* Text and read-only data */ IRQENTRY_TEXT SOFTIRQENTRY_TEXT diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 57972bdb213a..1a01da9fdc99 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -788,10 +788,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } switch (vma_shift) { +#ifndef __PAGETABLE_PMD_FOLDED case PUD_SHIFT: if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) break; fallthrough; +#endif case CONT_PMD_SHIFT: vma_shift = PMD_SHIFT; fallthrough; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1ee94002801f..29a6b8c9e830 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -40,7 +40,7 @@ #include <asm/traps.h> struct fault_info { - int (*fn)(unsigned long addr, unsigned int esr, + int (*fn)(unsigned long far, unsigned int esr, struct pt_regs *regs); int sig; int code; @@ -385,8 +385,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr) current->thread.fault_code = esr; } -static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs) +static void do_bad_area(unsigned long far, unsigned int esr, + struct pt_regs *regs) { + unsigned long addr = untagged_addr(far); + /* * If we are in kernel mode at this point, we have no context to * handle this fault with. @@ -395,8 +398,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re const struct fault_info *inf = esr_to_fault_info(esr); set_thread_esr(addr, esr); - arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr, - inf->name); + arm64_force_sig_fault(inf->sig, inf->code, far, inf->name); } else { __do_kernel_fault(addr, esr, regs); } @@ -448,7 +450,7 @@ static bool is_write_abort(unsigned int esr) return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); } -static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, +static int __kprobes do_page_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf; @@ -456,6 +458,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, vm_fault_t fault; unsigned long vm_flags = VM_ACCESS_FLAGS; unsigned int mm_flags = FAULT_FLAG_DEFAULT; + unsigned long addr = untagged_addr(far); if (kprobe_page_fault(regs, esr)) return 0; @@ -567,8 +570,7 @@ retry: * We had some memory, but were unable to successfully fix up * this page fault. */ - arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr, - inf->name); + arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name); } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) { unsigned int lsb; @@ -576,8 +578,7 @@ retry: if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); - arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb, - inf->name); + arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name); } else { /* * Something tried to access memory that isn't in our memory @@ -585,8 +586,7 @@ retry: */ arm64_force_sig_fault(SIGSEGV, fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR, - (void __user *)addr, - inf->name); + far, inf->name); } return 0; @@ -596,33 +596,35 @@ no_context: return 0; } -static int __kprobes do_translation_fault(unsigned long addr, +static int __kprobes do_translation_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { + unsigned long addr = untagged_addr(far); + if (is_ttbr0_addr(addr)) - return do_page_fault(addr, esr, regs); + return do_page_fault(far, esr, regs); - do_bad_area(addr, esr, regs); + do_bad_area(far, esr, regs); return 0; } -static int do_alignment_fault(unsigned long addr, unsigned int esr, +static int do_alignment_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { - do_bad_area(addr, esr, regs); + do_bad_area(far, esr, regs); return 0; } -static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) +static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs) { return 1; /* "fault" */ } -static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) +static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf; - void __user *siaddr; + unsigned long siaddr; inf = esr_to_fault_info(esr); @@ -634,19 +636,30 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 0; } - if (esr & ESR_ELx_FnV) - siaddr = NULL; - else - siaddr = (void __user *)addr; + if (esr & ESR_ELx_FnV) { + siaddr = 0; + } else { + /* + * The architecture specifies that the tag bits of FAR_EL1 are + * UNKNOWN for synchronous external aborts. Mask them out now + * so that userspace doesn't see them. + */ + siaddr = untagged_addr(far); + } arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr); return 0; } -static int do_tag_check_fault(unsigned long addr, unsigned int esr, +static int do_tag_check_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { - do_bad_area(addr, esr, regs); + /* + * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN for tag + * check faults. Mask them out now so that userspace doesn't see them. + */ + far &= (1UL << 60) - 1; + do_bad_area(far, esr, regs); return 0; } @@ -717,11 +730,12 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 63" }, }; -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf = esr_to_fault_info(esr); + unsigned long addr = untagged_addr(far); - if (!inf->fn(addr, esr, regs)) + if (!inf->fn(far, esr, regs)) return; if (!user_mode(regs)) { @@ -730,8 +744,12 @@ void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) show_pte(addr); } - arm64_notify_die(inf->name, regs, - inf->sig, inf->code, (void __user *)addr, esr); + /* + * At this point we have an unrecognized fault type whose tag bits may + * have been defined as UNKNOWN. Therefore we only expose the untagged + * address to the signal handler. + */ + arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr); } NOKPROBE_SYMBOL(do_mem_abort); @@ -744,8 +762,8 @@ NOKPROBE_SYMBOL(do_el0_irq_bp_hardening); void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - arm64_notify_die("SP/PC alignment exception", regs, - SIGBUS, BUS_ADRALN, (void __user *)addr, esr); + arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN, + addr, esr); } NOKPROBE_SYMBOL(do_sp_pc_abort); @@ -871,8 +889,7 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, arm64_apply_bp_hardening(); if (inf->fn(addr_if_watchpoint, esr, regs)) { - arm64_notify_die(inf->name, regs, - inf->sig, inf->code, (void __user *)pc, esr); + arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr); } debug_exception_exit(regs); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 095540667f0f..fbd452e12397 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -29,6 +29,7 @@ #include <linux/kexec.h> #include <linux/crash_dump.h> #include <linux/hugetlb.h> +#include <linux/acpi_iort.h> #include <asm/boot.h> #include <asm/fixmap.h> @@ -42,8 +43,6 @@ #include <asm/tlb.h> #include <asm/alternative.h> -#define ARM64_ZONE_DMA_BITS 30 - /* * We need to be able to catch inadvertent references to memstart_addr * that occur (potentially in generic code) before arm64_memblock_init() @@ -175,21 +174,34 @@ static void __init reserve_elfcorehdr(void) #endif /* CONFIG_CRASH_DUMP */ /* - * Return the maximum physical address for a zone with a given address size - * limit. It currently assumes that for memory starting above 4G, 32-bit - * devices will use a DMA offset. + * Return the maximum physical address for a zone accessible by the given bits + * limit. If DRAM starts above 32-bit, expand the zone to the maximum + * available memory, otherwise cap it at 32-bit. */ static phys_addr_t __init max_zone_phys(unsigned int zone_bits) { - phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, zone_bits); - return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM()); + phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); + phys_addr_t phys_start = memblock_start_of_DRAM(); + + if (phys_start > U32_MAX) + zone_mask = PHYS_ADDR_MAX; + else if (phys_start > zone_mask) + zone_mask = U32_MAX; + + return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; } static void __init zone_sizes_init(unsigned long min, unsigned long max) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; + unsigned int __maybe_unused acpi_zone_dma_bits; + unsigned int __maybe_unused dt_zone_dma_bits; #ifdef CONFIG_ZONE_DMA + acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); + dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); + zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); + arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32 @@ -269,7 +281,7 @@ static void __init fdt_enforce_memory_region(void) void __init arm64_memblock_init(void) { - const s64 linear_region_size = BIT(vabits_actual - 1); + const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); /* Handle linux,usable-memory-range property */ fdt_enforce_memory_region(); @@ -348,15 +360,18 @@ void __init arm64_memblock_init(void) if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern u16 memstart_offset_seed; - u64 range = linear_region_size - - (memblock_end_of_DRAM() - memblock_start_of_DRAM()); + u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); + int parange = cpuid_feature_extract_unsigned_field( + mmfr0, ID_AA64MMFR0_PARANGE_SHIFT); + s64 range = linear_region_size - + BIT(id_aa64mmfr0_parange_to_phys_shift(parange)); /* * If the size of the linear region exceeds, by a sufficient - * margin, the size of the region that the available physical - * memory spans, randomize the linear region as well. + * margin, the size of the region that the physical memory can + * span, randomize the linear region as well. */ - if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) { + if (memstart_offset_seed > 0 && range >= (s64)ARM64_MEMSTART_ALIGN) { range /= ARM64_MEMSTART_ALIGN; memstart_addr -= ARM64_MEMSTART_ALIGN * ((range * memstart_offset_seed) >> 16); @@ -367,7 +382,7 @@ void __init arm64_memblock_init(void) * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. */ - memblock_reserve(__pa_symbol(_text), _end - _text); + memblock_reserve(__pa_symbol(_stext), _end - _stext); if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { /* the generic initrd code expects virtual addresses */ initrd_start = __phys_to_virt(phys_initrd_start); @@ -376,18 +391,11 @@ void __init arm64_memblock_init(void) early_init_fdt_scan_reserved_mem(); - if (IS_ENABLED(CONFIG_ZONE_DMA)) { - zone_dma_bits = ARM64_ZONE_DMA_BITS; - arm64_dma_phys_limit = max_zone_phys(ARM64_ZONE_DMA_BITS); - } - if (IS_ENABLED(CONFIG_ZONE_DMA32)) arm64_dma32_phys_limit = max_zone_phys(32); else arm64_dma32_phys_limit = PHYS_MASK + 1; - reserve_crashkernel(); - reserve_elfcorehdr(); high_memory = __va(memblock_end_of_DRAM() - 1) + 1; @@ -427,6 +435,12 @@ void __init bootmem_init(void) sparse_init(); zone_sizes_init(min, max); + /* + * request_standard_resources() depends on crashkernel's memory being + * reserved, so do it here. + */ + reserve_crashkernel(); + memblock_dump_all(); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1c0f3e02f731..fe0721a44376 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -464,20 +464,35 @@ void __init mark_linear_text_alias_ro(void) /* * Remove the write permissions from the linear alias of .text/.rodata */ - update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), - (unsigned long)__init_begin - (unsigned long)_text, + update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), + (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_RO); } +static bool crash_mem_map __initdata; + +static int __init enable_crash_mem_map(char *arg) +{ + /* + * Proper parameter parsing is done by reserve_crashkernel(). We only + * need to know if the linear map has to avoid block mappings so that + * the crashkernel reservations can be unmapped later. + */ + crash_mem_map = true; + + return 0; +} +early_param("crashkernel", enable_crash_mem_map); + static void __init map_mem(pgd_t *pgdp) { - phys_addr_t kernel_start = __pa_symbol(_text); + phys_addr_t kernel_start = __pa_symbol(_stext); phys_addr_t kernel_end = __pa_symbol(__init_begin); phys_addr_t start, end; int flags = 0; u64 i; - if (rodata_full || debug_pagealloc_enabled()) + if (rodata_full || crash_mem_map || debug_pagealloc_enabled()) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* @@ -487,11 +502,6 @@ static void __init map_mem(pgd_t *pgdp) * the following for-loop */ memblock_mark_nomap(kernel_start, kernel_end - kernel_start); -#ifdef CONFIG_KEXEC_CORE - if (crashk_res.end) - memblock_mark_nomap(crashk_res.start, - resource_size(&crashk_res)); -#endif /* map all the memory banks */ for_each_mem_range(i, &start, &end) { @@ -506,7 +516,7 @@ static void __init map_mem(pgd_t *pgdp) } /* - * Map the linear alias of the [_text, __init_begin) interval + * Map the linear alias of the [_stext, __init_begin) interval * as non-executable now, and remove the write permission in * mark_linear_text_alias_ro() below (which will be called after * alternative patching has completed). This makes the contents @@ -518,21 +528,6 @@ static void __init map_mem(pgd_t *pgdp) __map_memblock(pgdp, kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS); memblock_clear_nomap(kernel_start, kernel_end - kernel_start); - -#ifdef CONFIG_KEXEC_CORE - /* - * Use page-level mappings here so that we can shrink the region - * in page granularity and put back unused memory to buddy system - * through /sys/kernel/kexec_crash_size interface. - */ - if (crashk_res.end) { - __map_memblock(pgdp, crashk_res.start, crashk_res.end + 1, - PAGE_KERNEL, - NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); - memblock_clear_nomap(crashk_res.start, - resource_size(&crashk_res)); - } -#endif } void mark_rodata_ro(void) @@ -665,7 +660,7 @@ static void __init map_kernel(pgd_t *pgdp) * Only rodata will be remapped with different permissions later on, * all other segments are allowed to use contiguous mappings. */ - map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0, + map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0, VM_NO_GUARD); map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL, &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD); @@ -1493,13 +1488,43 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb, unsigned long end_pfn = arg->start_pfn + arg->nr_pages; unsigned long pfn = arg->start_pfn; - if (action != MEM_GOING_OFFLINE) + if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE)) return NOTIFY_OK; for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long start = PFN_PHYS(pfn); + unsigned long end = start + (1UL << PA_SECTION_SHIFT); + ms = __pfn_to_section(pfn); - if (early_section(ms)) + if (!early_section(ms)) + continue; + + if (action == MEM_GOING_OFFLINE) { + /* + * Boot memory removal is not supported. Prevent + * it via blocking any attempted offline request + * for the boot memory and just report it. + */ + pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end); return NOTIFY_BAD; + } else if (action == MEM_OFFLINE) { + /* + * This should have never happened. Boot memory + * offlining should have been prevented by this + * very notifier. Probably some memory removal + * procedure might have changed which would then + * require further debug. + */ + pr_err("Boot memory [%lx %lx] offlined\n", start, end); + + /* + * Core memory hotplug does not process a return + * code from the notifier for MEM_OFFLINE events. + * The error condition has been reported. Return + * from here as if ignored. + */ + return NOTIFY_DONE; + } } return NOTIFY_OK; } @@ -1508,9 +1533,66 @@ static struct notifier_block prevent_bootmem_remove_nb = { .notifier_call = prevent_bootmem_remove_notifier, }; +/* + * This ensures that boot memory sections on the platform are online + * from early boot. Memory sections could not be prevented from being + * offlined, unless for some reason they are not online to begin with. + * This helps validate the basic assumption on which the above memory + * event notifier works to prevent boot memory section offlining and + * its possible removal. + */ +static void validate_bootmem_online(void) +{ + phys_addr_t start, end, addr; + struct mem_section *ms; + u64 i; + + /* + * Scanning across all memblock might be expensive + * on some big memory systems. Hence enable this + * validation only with DEBUG_VM. + */ + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + + for_each_mem_range(i, &start, &end) { + for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) { + ms = __pfn_to_section(PHYS_PFN(addr)); + + /* + * All memory ranges in the system at this point + * should have been marked as early sections. + */ + WARN_ON(!early_section(ms)); + + /* + * Memory notifier mechanism here to prevent boot + * memory offlining depends on the fact that each + * early section memory on the system is initially + * online. Otherwise a given memory section which + * is already offline will be overlooked and can + * be removed completely. Call out such sections. + */ + if (!online_section(ms)) + pr_err("Boot memory [%llx %llx] is offline, can be removed\n", + addr, addr + (1UL << PA_SECTION_SHIFT)); + } + } +} + static int __init prevent_bootmem_remove_init(void) { - return register_memory_notifier(&prevent_bootmem_remove_nb); + int ret = 0; + + if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return ret; + + validate_bootmem_online(); + ret = register_memory_notifier(&prevent_bootmem_remove_nb); + if (ret) + pr_err("%s: Notifier registration failed %d\n", __func__, ret); + + return ret; } -device_initcall(prevent_bootmem_remove_init); +early_initcall(prevent_bootmem_remove_init); #endif |
