diff options
Diffstat (limited to 'include/linux')
42 files changed, 976 insertions, 409 deletions
diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 8a582d242f06..20aa3c2d89f7 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -11,7 +11,7 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX]; void init_vmlinux_build_id(void); #else diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index d504eb4b49ab..2cb15fe4fe12 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -138,4 +138,10 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #define use_arch_cache_info() (false) #endif +#ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING +#define cpu_dcache_is_aliasing() false +#else +#include <asm/cachetype.h> +#endif + #endif /* _LINUX_CACHEINFO_H */ diff --git a/include/linux/cma.h b/include/linux/cma.h index 63873b93deaa..9db877506ea8 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -6,12 +6,8 @@ #include <linux/types.h> #include <linux/numa.h> -/* - * There is always at least global CMA area and a few optional - * areas configured in kernel .config. - */ #ifdef CONFIG_CMA_AREAS -#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) +#define MAX_CMA_AREAS CONFIG_CMA_AREAS #endif #define CMA_MAX_NAME 64 diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 9eaeaafe0cad..23270b16e1db 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -5,118 +5,47 @@ #include <linux/linkage.h> #include <linux/elfcore.h> #include <linux/elf.h> -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#include <asm/crash_core.h> -#endif -/* Location of a reserved region to hold the crash kernel. - */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; +struct kimage; + +#ifdef CONFIG_CRASH_DUMP -#define CRASH_CORE_NOTE_NAME "CORE" -#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) -#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) +int crash_shrink_memory(unsigned long new_size); +ssize_t crash_get_memory_size(void); +#ifndef arch_kexec_protect_crashkres /* - * The per-cpu notes area is a list of notes terminated by a "NULL" - * note header. For kdump, the code in vmcore.c runs in the context - * of the second kernel to combine them into one note. + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. + * + * Provide an empty default implementation here -- architecture + * code may override this */ -#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - CRASH_CORE_NOTE_NAME_BYTES + \ - CRASH_CORE_NOTE_DESC_BYTES) - -#define VMCOREINFO_BYTES PAGE_SIZE -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - VMCOREINFO_NOTE_NAME_BYTES + \ - VMCOREINFO_BYTES) - -typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; -/* Per cpu memory for storing cpu states in case of system crash. */ -extern note_buf_t __percpu *crash_notes; - -void crash_update_vmcoreinfo_safecopy(void *ptr); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_BUILD_ID() \ - ({ \ - static_assert(sizeof(vmlinux_build_id) == 20); \ - vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ - }) - -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SYMBOL_ARRAY(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_TYPE_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) - -extern unsigned char *vmcoreinfo_data; -extern size_t vmcoreinfo_size; -extern u32 *vmcoreinfo_note; - -Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, - void *data, size_t data_len); -void final_note(Elf_Word *buf); - -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base, - unsigned long long *low_size, bool *high); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) +static inline void arch_kexec_protect_crashkres(void) { } #endif -#ifndef CRASH_ALIGN -#define CRASH_ALIGN SZ_2M + +#ifndef arch_kexec_unprotect_crashkres +static inline void arch_kexec_unprotect_crashkres(void) { } #endif -#ifndef CRASH_ADDR_LOW_MAX -#define CRASH_ADDR_LOW_MAX SZ_4G + + + +#ifndef arch_crash_handle_hotplug_event +static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } #endif -#ifndef CRASH_ADDR_HIGH_MAX -#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() + +int crash_check_update_elfcorehdr(void); + +#ifndef crash_hotplug_cpu_support +static inline int crash_hotplug_cpu_support(void) { return 0; } +#endif + +#ifndef crash_hotplug_memory_support +static inline int crash_hotplug_memory_support(void) { return 0; } #endif -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high); -#else -static inline void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{} +#ifndef crash_get_elfcorehdr_size +static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } #endif /* Alignment required for elf header segment */ @@ -144,4 +73,23 @@ struct kexec_segment; #define KEXEC_CRASH_HP_REMOVE_MEMORY 4 #define KEXEC_CRASH_HP_INVALID_CPU -1U +extern void __crash_kexec(struct pt_regs *regs); +extern void crash_kexec(struct pt_regs *regs); +int kexec_should_crash(struct task_struct *p); +int kexec_crash_loaded(void); +void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); + +#else /* !CONFIG_CRASH_DUMP*/ +struct pt_regs; +struct task_struct; +struct kimage; +static inline void __crash_kexec(struct pt_regs *regs) { } +static inline void crash_kexec(struct pt_regs *regs) { } +static inline int kexec_should_crash(struct task_struct *p) { return 0; } +static inline int kexec_crash_loaded(void) { return 0; } +static inline void crash_save_cpu(struct pt_regs *regs, int cpu) {}; +static inline int kimage_crash_copy_vmcoreinfo(struct kimage *image) { return 0; }; +#endif /* CONFIG_CRASH_DUMP*/ + #endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h new file mode 100644 index 000000000000..5a9df944fb80 --- /dev/null +++ b/include/linux/crash_reserve.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_CRASH_RESERVE_H +#define LINUX_CRASH_RESERVE_H + +#include <linux/linkage.h> +#include <linux/elfcore.h> +#include <linux/elf.h> +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#include <asm/crash_reserve.h> +#endif + +/* Location of a reserved region to hold the crash kernel. + */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base, + unsigned long long *low_size, bool *high); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) +#endif +#ifndef CRASH_ALIGN +#define CRASH_ALIGN SZ_2M +#endif +#ifndef CRASH_ADDR_LOW_MAX +#define CRASH_ADDR_LOW_MAX SZ_4G +#endif +#ifndef CRASH_ADDR_HIGH_MAX +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() +#endif + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); +#else +static inline void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{} +#endif +#endif /* LINUX_CRASH_RESERVE_H */ diff --git a/include/linux/damon.h b/include/linux/damon.h index 5881e4ac30be..886d07294f4e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -127,18 +127,60 @@ enum damos_action { }; /** + * enum damos_quota_goal_metric - Represents the metric to be used as the goal + * + * @DAMOS_QUOTA_USER_INPUT: User-input value. + * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. + * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. + * + * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. + */ +enum damos_quota_goal_metric { + DAMOS_QUOTA_USER_INPUT, + DAMOS_QUOTA_SOME_MEM_PSI_US, + NR_DAMOS_QUOTA_GOAL_METRICS, +}; + +/** + * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. + * @metric: Metric to be used for representing the goal. + * @target_value: Target value of @metric to achieve with the tuning. + * @current_value: Current value of @metric. + * @last_psi_total: Last measured total PSI + * @list: List head for siblings. + * + * Data structure for getting the current score of the quota tuning goal. The + * score is calculated by how close @current_value and @target_value are. Then + * the score is entered to DAMON's internal feedback loop mechanism to get the + * auto-tuned quota. + * + * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually + * entered by the user, probably inside the kdamond callbacks. Otherwise, + * DAMON sets @current_value with self-measured value of @metric. + */ +struct damos_quota_goal { + enum damos_quota_goal_metric metric; + unsigned long target_value; + unsigned long current_value; + /* metric-dependent fields */ + union { + u64 last_psi_total; + }; + struct list_head list; +}; + +/** * struct damos_quota - Controls the aggressiveness of the given scheme. + * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. - * @reset_interval: Charge reset interval in milliseconds. + * @goals: Head of quota tuning goals (&damos_quota_goal) list. + * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. * @weight_age: Weight of the region's age for prioritization. * - * @get_score: Feedback function for self-tuning quota. - * @get_score_arg: Parameter for @get_score - * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -151,42 +193,35 @@ enum damos_action { * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. * + * If @goals is not empt, DAMON calculates yet another size quota based on the + * goals using its internal feedback loop algorithm, for every @reset_interval. + * Then, if the new size quota is smaller than the effective quota, it uses the + * new size quota as the effective quota. + * + * The resulting effective size quota in bytes is set to @esz. + * * For selecting regions within the quota, DAMON prioritizes current scheme's * target memory regions using the &struct damon_operations->get_scheme_score. * You could customize the prioritization logic by setting &weight_sz, * &weight_nr_accesses, and &weight_age, because monitoring operations are * encouraged to respect those. - * - * If @get_score function pointer is set, DAMON calls it back with - * @get_score_arg and get the return value of it for every @reset_interval. - * Then, DAMON adjusts the effective quota using the return value as a feedback - * score to the current quota, using its internal feedback loop algorithm. - * - * The feedback loop algorithem assumes the quota input and the feedback score - * output are in a positive proportional relationship, and the goal of the - * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are - * set together, those work as a hard limit quota. If neither @ms nor @sz are - * set, the mechanism starts from the quota of one byte. */ struct damos_quota { + unsigned long reset_interval; unsigned long ms; unsigned long sz; - unsigned long reset_interval; + struct list_head goals; + unsigned long esz; unsigned int weight_sz; unsigned int weight_nr_accesses; unsigned int weight_age; - unsigned long (*get_score)(void *arg); - void *get_score_arg; - /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; unsigned long total_charged_ns; - unsigned long esz; /* Effective size quota in bytes */ - /* For charging the quota */ unsigned long charged_sz; unsigned long charged_from; @@ -640,6 +675,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damon_for_each_scheme_safe(s, next, ctx) \ list_for_each_entry_safe(s, next, &(ctx)->schemes, list) +#define damos_for_each_quota_goal(goal, quota) \ + list_for_each_entry(goal, "a->goals, list) + +#define damos_for_each_quota_goal_safe(goal, next, quota) \ + list_for_each_entry_safe(goal, next, &(quota)->goals, list) + #define damos_for_each_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->filters, list) @@ -673,6 +714,12 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); +struct damos_quota_goal *damos_new_quota_goal( + enum damos_quota_goal_metric metric, + unsigned long target_value); +void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g); +void damos_destroy_quota_goal(struct damos_quota_goal *goal); + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, unsigned long apply_interval_us, diff --git a/include/linux/dax.h b/include/linux/dax.h index b463502b16e1..9d3e3327af4c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -63,6 +63,8 @@ void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); +void set_dax_nocache(struct dax_device *dax_dev); +void set_dax_nomc(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); @@ -86,11 +88,7 @@ static inline void *dax_holder(struct dax_device *dax_dev) static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { - /* - * Callers should check IS_ENABLED(CONFIG_DAX) to know if this - * NULL is an error or expected. - */ - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static inline void put_dax(struct dax_device *dax_dev) { @@ -109,6 +107,12 @@ static inline bool dax_synchronous(struct dax_device *dax_dev) { return true; } +static inline void set_dax_nocache(struct dax_device *dax_dev) +{ +} +static inline void set_dax_nomc(struct dax_device *dax_dev) +{ +} static inline void set_dax_synchronous(struct dax_device *dax_dev) { } @@ -124,9 +128,6 @@ static inline size_t dax_recovery_write(struct dax_device *dax_dev, } #endif -void set_dax_nocache(struct dax_device *dax_dev); -void set_dax_nomc(struct dax_device *dax_dev); - struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); diff --git a/include/linux/efi.h b/include/linux/efi.h index f0d56f106b60..d59b0947fba0 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -694,6 +694,11 @@ extern struct efi { extern struct mm_struct efi_mm; +static inline bool mm_is_efi(struct mm_struct *mm) +{ + return IS_ENABLED(CONFIG_EFI) && mm == &efi_mm; +} + static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) { diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 937c2a9b6e54..c775ea3c6015 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -342,7 +342,7 @@ void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); -void setup_pcp_cacheinfo(void); +void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 1b6053da8754..868c8fb1bbc1 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -21,44 +21,78 @@ typedef unsigned int __bitwise gfp_t; * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c */ +enum { + ___GFP_DMA_BIT, + ___GFP_HIGHMEM_BIT, + ___GFP_DMA32_BIT, + ___GFP_MOVABLE_BIT, + ___GFP_RECLAIMABLE_BIT, + ___GFP_HIGH_BIT, + ___GFP_IO_BIT, + ___GFP_FS_BIT, + ___GFP_ZERO_BIT, + ___GFP_UNUSED_BIT, /* 0x200u unused */ + ___GFP_DIRECT_RECLAIM_BIT, + ___GFP_KSWAPD_RECLAIM_BIT, + ___GFP_WRITE_BIT, + ___GFP_NOWARN_BIT, + ___GFP_RETRY_MAYFAIL_BIT, + ___GFP_NOFAIL_BIT, + ___GFP_NORETRY_BIT, + ___GFP_MEMALLOC_BIT, + ___GFP_COMP_BIT, + ___GFP_NOMEMALLOC_BIT, + ___GFP_HARDWALL_BIT, + ___GFP_THISNODE_BIT, + ___GFP_ACCOUNT_BIT, + ___GFP_ZEROTAGS_BIT, +#ifdef CONFIG_KASAN_HW_TAGS + ___GFP_SKIP_ZERO_BIT, + ___GFP_SKIP_KASAN_BIT, +#endif +#ifdef CONFIG_LOCKDEP + ___GFP_NOLOCKDEP_BIT, +#endif + ___GFP_LAST_BIT +}; + /* Plain integer GFP bitmasks. Do not use this directly. */ -#define ___GFP_DMA 0x01u -#define ___GFP_HIGHMEM 0x02u -#define ___GFP_DMA32 0x04u -#define ___GFP_MOVABLE 0x08u -#define ___GFP_RECLAIMABLE 0x10u -#define ___GFP_HIGH 0x20u -#define ___GFP_IO 0x40u -#define ___GFP_FS 0x80u -#define ___GFP_ZERO 0x100u +#define ___GFP_DMA BIT(___GFP_DMA_BIT) +#define ___GFP_HIGHMEM BIT(___GFP_HIGHMEM_BIT) +#define ___GFP_DMA32 BIT(___GFP_DMA32_BIT) +#define ___GFP_MOVABLE BIT(___GFP_MOVABLE_BIT) +#define ___GFP_RECLAIMABLE BIT(___GFP_RECLAIMABLE_BIT) +#define ___GFP_HIGH BIT(___GFP_HIGH_BIT) +#define ___GFP_IO BIT(___GFP_IO_BIT) +#define ___GFP_FS BIT(___GFP_FS_BIT) +#define ___GFP_ZERO BIT(___GFP_ZERO_BIT) /* 0x200u unused */ -#define ___GFP_DIRECT_RECLAIM 0x400u -#define ___GFP_KSWAPD_RECLAIM 0x800u -#define ___GFP_WRITE 0x1000u -#define ___GFP_NOWARN 0x2000u -#define ___GFP_RETRY_MAYFAIL 0x4000u -#define ___GFP_NOFAIL 0x8000u -#define ___GFP_NORETRY 0x10000u -#define ___GFP_MEMALLOC 0x20000u -#define ___GFP_COMP 0x40000u -#define ___GFP_NOMEMALLOC 0x80000u -#define ___GFP_HARDWALL 0x100000u -#define ___GFP_THISNODE 0x200000u -#define ___GFP_ACCOUNT 0x400000u -#define ___GFP_ZEROTAGS 0x800000u +#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT) +#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT) +#define ___GFP_WRITE BIT(___GFP_WRITE_BIT) +#define ___GFP_NOWARN BIT(___GFP_NOWARN_BIT) +#define ___GFP_RETRY_MAYFAIL BIT(___GFP_RETRY_MAYFAIL_BIT) +#define ___GFP_NOFAIL BIT(___GFP_NOFAIL_BIT) +#define ___GFP_NORETRY BIT(___GFP_NORETRY_BIT) +#define ___GFP_MEMALLOC BIT(___GFP_MEMALLOC_BIT) +#define ___GFP_COMP BIT(___GFP_COMP_BIT) +#define ___GFP_NOMEMALLOC BIT(___GFP_NOMEMALLOC_BIT) +#define ___GFP_HARDWALL BIT(___GFP_HARDWALL_BIT) +#define ___GFP_THISNODE BIT(___GFP_THISNODE_BIT) +#define ___GFP_ACCOUNT BIT(___GFP_ACCOUNT_BIT) +#define ___GFP_ZEROTAGS BIT(___GFP_ZEROTAGS_BIT) #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_ZERO 0x1000000u -#define ___GFP_SKIP_KASAN 0x2000000u +#define ___GFP_SKIP_ZERO BIT(___GFP_SKIP_ZERO_BIT) +#define ___GFP_SKIP_KASAN BIT(___GFP_SKIP_KASAN_BIT) #else #define ___GFP_SKIP_ZERO 0 #define ___GFP_SKIP_KASAN 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x4000000u +#define ___GFP_NOLOCKDEP BIT(___GFP_NOLOCKDEP_BIT) #else #define ___GFP_NOLOCKDEP 0 #endif -/* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) @@ -249,7 +283,7 @@ typedef unsigned int __bitwise gfp_t; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT ___GFP_LAST_BIT #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 451c1dff0e87..00341b56d291 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -439,6 +439,13 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * memcpy_from_folio - Copy a range of bytes from a folio. + * @to: The memory to copy to. + * @folio: The folio to read from. + * @offset: The first byte in the folio to read. + * @len: The number of bytes to copy. + */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { @@ -460,6 +467,13 @@ static inline void memcpy_from_folio(char *to, struct folio *folio, } while (len > 0); } +/** + * memcpy_to_folio - Copy a range of bytes to a folio. + * @folio: The folio to write to. + * @offset: The first byte in the folio to store to. + * @from: The memory to copy from. + * @len: The number of bytes to copy. + */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 5adb86af35fc..de0c89105076 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -265,10 +265,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); -int split_huge_page_to_list(struct page *page, struct list_head *list); +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order); static inline int split_huge_page(struct page *page) { - return split_huge_page_to_list(page, NULL); + return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio); @@ -422,7 +423,8 @@ can_split_folio(struct folio *folio, int *pextra_pins) return false; } static inline int -split_huge_page_to_list(struct page *page, struct list_head *list) +split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) { return 0; } @@ -519,17 +521,20 @@ static inline bool thp_migration_supported(void) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int split_folio_to_list(struct folio *folio, - struct list_head *list) +static inline int split_folio_to_list_to_order(struct folio *folio, + struct list_head *list, int new_order) { - return split_huge_page_to_list(&folio->page, list); + return split_huge_page_to_list_to_order(&folio->page, list, new_order); } -static inline int split_folio(struct folio *folio) +static inline int split_folio_to_order(struct folio *folio, int new_order) { - return split_folio_to_list(folio, NULL); + return split_folio_to_list_to_order(folio, NULL, new_order); } +#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) +#define split_folio(f) split_folio_to_order(f, 0) + /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c1ee640d87b1..77b30a8c6076 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); extern int sysctl_hugetlb_shm_group; -extern struct list_head huge_boot_pages; +extern struct list_head huge_boot_pages[MAX_NUMNODES]; /* arch callbacks */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 400cb6c02176..060835bb82d5 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -15,7 +15,8 @@ #if !defined(__ASSEMBLY__) -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> +#include <linux/crash_reserve.h> #include <asm/io.h> #include <linux/range.h> @@ -31,6 +32,7 @@ extern note_buf_t __percpu *crash_notes; #include <linux/module.h> #include <linux/highmem.h> #include <asm/kexec.h> +#include <linux/crash_core.h> /* Verify architecture specific macros are defined */ @@ -378,13 +380,6 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image, static inline int machine_kexec_post_load(struct kimage *image) { return 0; } #endif -extern void __crash_kexec(struct pt_regs *); -extern void crash_kexec(struct pt_regs *); -int kexec_should_crash(struct task_struct *); -int kexec_crash_loaded(void); -void crash_save_cpu(struct pt_regs *regs, int cpu); -extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); - extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -408,24 +403,6 @@ bool kexec_load_permitted(int kexec_image_type); /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int crash_shrink_memory(unsigned long new_size); -ssize_t crash_get_memory_size(void); - -#ifndef arch_kexec_protect_crashkres -/* - * Protection mechanism for crashkernel reserved memory after - * the kdump kernel is loaded. - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -static inline void arch_kexec_protect_crashkres(void) { } -#endif - -#ifndef arch_kexec_unprotect_crashkres -static inline void arch_kexec_unprotect_crashkres(void) { } -#endif - #ifndef page_to_boot_pfn static inline unsigned long page_to_boot_pfn(struct page *page) { @@ -482,24 +459,6 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } #endif -#ifndef arch_crash_handle_hotplug_event -static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } -#endif - -int crash_check_update_elfcorehdr(void); - -#ifndef crash_hotplug_cpu_support -static inline int crash_hotplug_cpu_support(void) { return 0; } -#endif - -#ifndef crash_hotplug_memory_support -static inline int crash_hotplug_memory_support(void) { return 0; } -#endif - -#ifndef crash_get_elfcorehdr_size -static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } -#endif - extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, ...) \ diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 7675a48a0701..792b67ceb631 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -24,6 +24,8 @@ enum lru_status { LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ + LRU_STOP, /* stop lru list walking. May drop the lock + internally, but has to return locked. */ }; struct list_lru_one { @@ -62,8 +64,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #define list_lru_init(lru) \ __list_lru_init((lru), false, NULL, NULL) -#define list_lru_init_key(lru, key) \ - __list_lru_init((lru), false, (key), NULL) #define list_lru_init_memcg(lru, shrinker) \ __list_lru_init((lru), true, NULL, shrinker) @@ -170,22 +170,6 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); -/** - * list_lru_putback: undo list_lru_isolate - * @lru: the lru pointer. - * @item: the item to put back. - * @nid: the node id of the sublist to put the item back to. - * @memcg: the cgroup of the sublist to put the item back to. - * - * Put back an isolated item into its original LRU. Note that unlike - * list_lru_add, this does not increment the node LRU count (as - * list_lru_isolate does not originally decrement this count). - * - * Since we might have dropped the LRU lock in between, recompute list_lru_one - * from the node's id and memcg. - */ -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 20ff87f8e001..394fd0a887ae 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -14,6 +14,7 @@ #include <linux/vm_event_item.h> #include <linux/hardirq.h> #include <linux/jump_label.h> +#include <linux/kernel.h> #include <linux/page_counter.h> #include <linux/vmpressure.h> #include <linux/eventfd.h> @@ -712,18 +713,16 @@ static inline void mem_cgroup_uncharge(struct folio *folio) __mem_cgroup_uncharge(folio); } -void __mem_cgroup_uncharge_list(struct list_head *page_list); -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) +void __mem_cgroup_uncharge_folios(struct folio_batch *folios); +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { if (mem_cgroup_disabled()) return; - __mem_cgroup_uncharge_list(page_list); + __mem_cgroup_uncharge_folios(folios); } void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); - void mem_cgroup_replace_folio(struct folio *old, struct folio *new); - void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -1162,7 +1161,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, unsigned int nr); +void split_page_memcg(struct page *head, int old_order, int new_order); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1294,7 +1293,7 @@ static inline void mem_cgroup_uncharge(struct folio *folio) { } -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } @@ -1620,7 +1619,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, unsigned int nr) +static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } @@ -1694,18 +1693,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, return folio_lruvec_lock_irq(folio); } -/* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, - struct lruvec *locked_lruvec, unsigned long *flags) +/* Don't lock again iff folio's lruvec locked */ +static inline void folio_lruvec_relock_irqsave(struct folio *folio, + struct lruvec **lruvecp, unsigned long *flags) { - if (locked_lruvec) { - if (folio_matches_lruvec(folio, locked_lruvec)) - return locked_lruvec; + if (*lruvecp) { + if (folio_matches_lruvec(folio, *lruvecp)) + return; - unlock_page_lruvec_irqrestore(locked_lruvec, *flags); + unlock_page_lruvec_irqrestore(*lruvecp, *flags); } - return folio_lruvec_lock_irqsave(folio, flags); + *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/memory.h b/include/linux/memory.h index f53cfdaaaa41..939a16bd5cea 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,8 +96,17 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) +#define MEM_PREPARE_ONLINE (1<<6) +#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { + /* + * The altmap_start_pfn and altmap_nr_pages fields are designated for + * specifying the altmap range and are exclusively intended for use in + * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + */ + unsigned long altmap_start_pfn; + unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7d2076583494..7a9ff464608d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -106,6 +106,22 @@ typedef int __bitwise mhp_t; * implies the node id (nid). */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) +/* + * The hotplugged memory is completely inaccessible while the memory is + * offline. The memory provider will handle MEM_PREPARE_ONLINE / + * MEM_FINISH_OFFLINE notifications and make the memory accessible. + * + * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, + * because the altmap cannot be written (e.g., poisoned) when adding + * memory -- before it is set online. + * + * This allows for adding memory with an altmap that is not currently + * made available by a hypervisor. When onlining that memory, the + * hypervisor can be instructed to make that memory available, and + * the onlining phase will not require any memory allocations, which is + * helpful in low-memory situations. + */ +#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -121,6 +137,7 @@ struct mhp_params { bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); struct range mhp_get_pluggable_range(bool need_mapping); +bool mhp_supports_memmap_on_memory(void); /* * Zone resizing functions @@ -154,7 +171,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, bool mhp_off_inaccessible); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); @@ -262,6 +279,11 @@ static inline bool movable_node_is_enabled(void) return false; } +static inline bool mhp_supports_memmap_on_memory(void) +{ + return false; +} + static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 744c830f4b13..3f7143ade32c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,6 +25,7 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; + bool inaccessible; }; /* @@ -108,7 +109,7 @@ struct dev_pagemap_ops { * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @done: completion for @ref - * @type: memory type: see MEMORY_* in memory_hotplug.h + * @type: memory type: see MEMORY_* above in memremap.h * @flags: PGMAP_* flags to specify defailed behavior * @vmemmap_shift: structural definition of how the vmemmap page metadata * is populated, specifically the metadata page order. diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec5169..699e850d143c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -36,6 +36,7 @@ struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; +struct folio_batch; extern int sysctl_page_lock_unfairness; @@ -226,7 +227,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); @@ -781,6 +781,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, return NULL; } +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); +} + static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); @@ -1178,7 +1183,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes - it does not include PTE-mapped sub-pages; look - * at folio_mapcount() or page_mapcount() or total_mapcount() instead. + * at folio_mapcount() or page_mapcount() instead. */ static inline int folio_entire_mapcount(struct folio *folio) { @@ -1238,13 +1243,6 @@ static inline int folio_mapcount(struct folio *folio) return folio_total_mapcount(folio); } -static inline int total_mapcount(struct page *page) -{ - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) + 1; - return folio_total_mapcount(page_folio(page)); -} - static inline bool folio_large_is_mapped(struct folio *folio) { /* @@ -1514,6 +1512,8 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); + /* * union release_pages_arg - an array of pages or folios * @@ -1536,18 +1536,19 @@ void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. - * @nr: How many folios there are. * - * Like folio_put(), but for an array of folios. This is more efficient - * than writing the loop yourself as it will optimise the locks which - * need to be taken if the folios are freed. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ -static inline void folios_put(struct folio **folios, unsigned int nr) +static inline void folios_put(struct folio_batch *folios) { - release_pages(folios, nr); + folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) @@ -1640,13 +1641,11 @@ static inline int page_zone_id(struct page *page) } #ifdef NODE_NOT_IN_PAGE_FLAGS -extern int page_to_nid(const struct page *page); +int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { - struct page *p = (struct page *)page; - - return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK; + return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif @@ -2065,6 +2064,13 @@ static inline long folio_nr_pages(struct folio *folio) #endif } +/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#else +#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +#endif + /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to @@ -2595,19 +2601,19 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member) mm_trace_rss_stat(mm, member); } -/* Optimized variant when page is already known not to be PageAnon */ -static inline int mm_counter_file(struct page *page) +/* Optimized variant when folio is already known not to be anon */ +static inline int mm_counter_file(struct folio *folio) { - if (PageSwapBacked(page)) + if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } -static inline int mm_counter(struct page *page) +static inline int mm_counter(struct folio *folio) { - if (PageAnon(page)) + if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(page); + return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8b611e13153e..5240bd7bca33 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -210,8 +210,8 @@ struct page { * * An 'encoded_page' pointer is a pointer to a regular 'struct page', but * with the low bits of the pointer indicating extra context-dependent - * information. Not super-common, but happens in mmu_gather and mlock - * handling, and this acts as a type system check on that use. + * information. Only used in mmu_gather handling, and this acts as a type + * system check on that use. * * We only really have two guaranteed bits in general, although you could * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -220,21 +220,46 @@ struct page { * Use the supplied helper functions to endcode/decode the pointer and bits. */ struct encoded_page; -#define ENCODE_PAGE_BITS 3ul + +#define ENCODED_PAGE_BITS 3ul + +/* Perform rmap removal after we have flushed the TLB. */ +#define ENCODED_PAGE_BIT_DELAY_RMAP 1ul + +/* + * The next item in an encoded_page array is the "nr_pages" argument, specifying + * the number of consecutive pages starting from this page, that all belong to + * the same folio. For example, "nr_pages" corresponds to the number of folio + * references that must be dropped. If this bit is not set, "nr_pages" is + * implicitly 1. + */ +#define ENCODED_PAGE_BIT_NR_PAGES_NEXT 2ul + static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) { - BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); + BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); return (struct encoded_page *)(flags | (unsigned long)page); } static inline unsigned long encoded_page_flags(struct encoded_page *page) { - return ENCODE_PAGE_BITS & (unsigned long)page; + return ENCODED_PAGE_BITS & (unsigned long)page; } static inline struct page *encoded_page_ptr(struct encoded_page *page) { - return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); + return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); +} + +static __always_inline struct encoded_page *encode_nr_pages(unsigned long nr) +{ + VM_WARN_ON_ONCE((nr << 2) >> 2 != nr); + return (struct encoded_page *)(nr << 2); +} + +static __always_inline unsigned long encoded_nr_pages(struct encoded_page *page) +{ + return ((unsigned long)page) >> 2; } /* @@ -394,12 +419,13 @@ FOLIO_MATCH(compound_head, _head_2a); /** * struct ptdesc - Memory descriptor for page tables. - * @__page_flags: Same as page flags. Unused for page tables. + * @__page_flags: Same as page flags. Powerpc only. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. Used for s390 and x86. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. + * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @_pt_pad_2: Padding to ensure proper alignment. @@ -425,6 +451,7 @@ struct ptdesc { unsigned long __page_mapping; union { + pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; }; @@ -450,6 +477,7 @@ TABLE_MATCH(flags, __page_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); +TABLE_MATCH(index, pt_index); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); TABLE_MATCH(_refcount, __page_refcount); diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 7c3e7b0b0e8f..39a7714605a7 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -10,7 +10,7 @@ struct vm_area_struct; struct mm_struct; struct vma_iterator; -void dump_page(struct page *page, const char *reason); +void dump_page(const struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); void vma_iter_dump_tree(const struct vma_iterator *vmi); diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index f2b7a3f04099..bbaec80c78c5 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -11,7 +11,7 @@ #endif #ifndef leave_mm -static inline void leave_mm(int cpu) { } +static inline void leave_mm(void) { } #endif /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a497f189d988..c11b7cde81ef 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -76,9 +76,12 @@ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) +# define is_migrate_cma_folio(folio, pfn) (MIGRATE_CMA == \ + get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK)) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false +# define is_migrate_cma_folio(folio, pfn) false #endif static inline bool is_migrate_movable(int mt) @@ -464,7 +467,7 @@ enum { #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { - /* set to max_seq after each iteration */ + /* synced with max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; @@ -479,8 +482,8 @@ struct lru_gen_mm_state { struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* unstable max_seq from lru_gen_folio */ - unsigned long max_seq; + /* max_seq from lru_gen_folio: can be out of date */ + unsigned long seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 001b2ce83832..89b1e0ed9811 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -115,6 +115,14 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); +#ifdef CONFIG_MODULES +void flush_module_init_free_work(void); +#else +static inline void flush_module_init_free_work(void) +{ +} +#endif + /* Any cleanup needed when module leaves. */ void module_arch_cleanup(struct module *mod); diff --git a/include/linux/padata.h b/include/linux/padata.h index 495b16b6b4d7..0146daf34430 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -137,6 +137,7 @@ struct padata_shell { * appropriate for one worker thread to do at once. * @max_threads: Max threads to use for the job, actual number may be less * depending on task size and minimum chunk size. + * @numa_aware: Distribute jobs to different nodes with CPU in a round robin fashion. */ struct padata_mt_job { void (*thread_fn)(unsigned long start, unsigned long end, void *arg); @@ -146,6 +147,7 @@ struct padata_mt_job { unsigned long align; unsigned long min_chunk; int max_threads; + bool numa_aware; }; /** @@ -178,10 +180,6 @@ struct padata_instance { #ifdef CONFIG_PADATA extern void __init padata_init(void); -#else -static inline void __init padata_init(void) {} -#endif - extern struct padata_instance *padata_alloc(const char *name); extern void padata_free(struct padata_instance *pinst); extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst); @@ -192,4 +190,12 @@ extern void padata_do_serial(struct padata_priv *padata); extern void __init padata_do_multithreaded(struct padata_mt_job *job); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); +#else +static inline void __init padata_init(void) {} +static inline void __init padata_do_multithreaded(struct padata_mt_job *job) +{ + job->thread_fn(job->start, job->start + job->size, job->fn_arg); +} +#endif + #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 735cddc13d20..652d77805e99 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -237,7 +237,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) } #endif -static __always_inline int page_is_fake_head(struct page *page) +static __always_inline int page_is_fake_head(const struct page *page) { return page_fixed_fake_head(page) != page; } @@ -281,12 +281,12 @@ static inline unsigned long _compound_head(const struct page *page) */ #define folio_page(folio, n) nth_page(&(folio)->page, n) -static __always_inline int PageTail(struct page *page) +static __always_inline int PageTail(const struct page *page) { return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } -static __always_inline int PageCompound(struct page *page) +static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags) || READ_ONCE(page->compound_head) & 1; @@ -306,6 +306,16 @@ static inline void page_init_poison(struct page *page, size_t size) } #endif +static const unsigned long *const_folio_flags(const struct folio *folio, + unsigned n) +{ + const struct page *page = &folio->page; + + VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); + return &page[n].flags; +} + static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; @@ -328,9 +338,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) * for compound page all operations related to the page flag applied to * head page. * - * PF_ONLY_HEAD: - * for compound page, callers only ever operate on the head page. - * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. @@ -346,9 +353,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) -#define PF_ONLY_HEAD(page, enforce) ({ \ - VM_BUG_ON_PGFLAGS(PageTail(page), page); \ - PF_POISONED_CHECK(page); }) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) @@ -362,59 +366,81 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 -#define FOLIO_PF_ONLY_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 +#define FOLIO_HEAD_PAGE 0 +#define FOLIO_SECOND_PAGE 1 + /* * Macros to create function definitions for page flags */ +#define FOLIO_TEST_FLAG(name, page) \ +static __always_inline bool folio_test_##name(const struct folio *folio) \ +{ return test_bit(PG_##name, const_folio_flags(folio, page)); } + +#define FOLIO_SET_FLAG(name, page) \ +static __always_inline void folio_set_##name(struct folio *folio) \ +{ set_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_CLEAR_FLAG(name, page) \ +static __always_inline void folio_clear_##name(struct folio *folio) \ +{ clear_bit(PG_##name, folio_flags(folio, page)); } + +#define __FOLIO_SET_FLAG(name, page) \ +static __always_inline void __folio_set_##name(struct folio *folio) \ +{ __set_bit(PG_##name, folio_flags(folio, page)); } + +#define __FOLIO_CLEAR_FLAG(name, page) \ +static __always_inline void __folio_clear_##name(struct folio *folio) \ +{ __clear_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_TEST_SET_FLAG(name, page) \ +static __always_inline bool folio_test_set_##name(struct folio *folio) \ +{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_TEST_CLEAR_FLAG(name, page) \ +static __always_inline bool folio_test_clear_##name(struct folio *folio) \ +{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_FLAG(name, page) \ +FOLIO_TEST_FLAG(name, page) \ +FOLIO_SET_FLAG(name, page) \ +FOLIO_CLEAR_FLAG(name, page) + #define TESTPAGEFLAG(uname, lname, policy) \ -static __always_inline bool folio_test_##lname(struct folio *folio) \ -{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ -static __always_inline int Page##uname(struct page *page) \ +FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ +static __always_inline int Page##uname(const struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void folio_set_##lname(struct folio *folio) \ -{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void folio_clear_##lname(struct folio *folio) \ -{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void __folio_set_##lname(struct folio *folio) \ -{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +__FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void __folio_clear_##lname(struct folio *folio) \ -{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ -static __always_inline \ -bool folio_test_set_##lname(struct folio *folio) \ -{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ -static __always_inline \ -bool folio_test_clear_##lname(struct folio *folio) \ -{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } @@ -465,7 +491,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) -PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) +FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) PAGEFLAG(Referenced, referenced, PF_HEAD) TESTCLEARFLAG(Referenced, referenced, PF_HEAD) @@ -532,13 +558,13 @@ PAGEFLAG_FALSE(HighMem, highmem) #endif #ifdef CONFIG_SWAP -static __always_inline bool folio_test_swapcache(struct folio *folio) +static __always_inline bool folio_test_swapcache(const struct folio *folio) { return folio_test_swapbacked(folio) && - test_bit(PG_swapcache, folio_flags(folio, 0)); + test_bit(PG_swapcache, const_folio_flags(folio, 0)); } -static __always_inline bool PageSwapCache(struct page *page) +static __always_inline bool PageSwapCache(const struct page *page) { return folio_test_swapcache(page_folio(page)); } @@ -583,10 +609,10 @@ PAGEFLAG_FALSE(HWPoison, hwpoison) #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) -TESTPAGEFLAG(Young, young, PF_ANY) -SETPAGEFLAG(Young, young, PF_ANY) -TESTCLEARFLAG(Young, young, PF_ANY) -PAGEFLAG(Idle, idle, PF_ANY) +FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) +FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE) +FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE) +FOLIO_FLAG(idle, FOLIO_HEAD_PAGE) #endif /* @@ -637,22 +663,22 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) */ #define PAGE_MAPPING_DAX_SHARED ((void *)0x1) -static __always_inline bool folio_mapping_flags(struct folio *folio) +static __always_inline bool folio_mapping_flags(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; } -static __always_inline int PageMappingFlags(struct page *page) +static __always_inline int PageMappingFlags(const struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } -static __always_inline bool folio_test_anon(struct folio *folio) +static __always_inline bool folio_test_anon(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; } -static __always_inline bool PageAnon(struct page *page) +static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); } @@ -663,7 +689,7 @@ static __always_inline bool __folio_test_movable(const struct folio *folio) PAGE_MAPPING_MOVABLE; } -static __always_inline int __PageMovable(struct page *page) +static __always_inline int __PageMovable(const struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_MOVABLE; @@ -676,13 +702,13 @@ static __always_inline int __PageMovable(struct page *page) * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ -static __always_inline bool folio_test_ksm(struct folio *folio) +static __always_inline bool folio_test_ksm(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } -static __always_inline bool PageKsm(struct page *page) +static __always_inline bool PageKsm(const struct page *page) { return folio_test_ksm(page_folio(page)); } @@ -721,9 +747,9 @@ static inline bool folio_xor_flags_has_waiters(struct folio *folio, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. */ -static inline bool folio_test_uptodate(struct folio *folio) +static inline bool folio_test_uptodate(const struct folio *folio) { - bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); + bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. @@ -738,7 +764,7 @@ static inline bool folio_test_uptodate(struct folio *folio) return ret; } -static inline int PageUptodate(struct page *page) +static inline int PageUptodate(const struct page *page) { return folio_test_uptodate(page_folio(page)); } @@ -780,12 +806,12 @@ void set_page_writeback(struct page *page); #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) -static __always_inline bool folio_test_head(struct folio *folio) +static __always_inline bool folio_test_head(const struct folio *folio) { - return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); + return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)); } -static __always_inline int PageHead(struct page *page) +static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); @@ -801,7 +827,7 @@ CLEARPAGEFLAG(Head, head, PF_ANY) * * Return: True if the folio is larger than one page. */ -static inline bool folio_test_large(struct folio *folio) +static inline bool folio_test_large(const struct folio *folio) { return folio_test_head(folio); } @@ -830,7 +856,7 @@ TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_HUGETLB_PAGE -int PageHuge(struct page *page); +int PageHuge(const struct page *page); SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) @@ -843,10 +869,10 @@ CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) * Return: True for hugetlbfs folios, false for anon folios or folios * belonging to other filesystems. */ -static inline bool folio_test_hugetlb(struct folio *folio) +static inline bool folio_test_hugetlb(const struct folio *folio) { return folio_test_large(folio) && - test_bit(PG_hugetlb, folio_flags(folio, 1)); + test_bit(PG_hugetlb, const_folio_flags(folio, 1)); } #else TESTPAGEFLAG_FALSE(Huge, hugetlb) @@ -861,7 +887,7 @@ TESTPAGEFLAG_FALSE(Huge, hugetlb) * hugetlbfs pages, but not normal pages. PageTransHuge() can only be * called only in the core VM paths where hugetlbfs pages can't exist. */ -static inline int PageTransHuge(struct page *page) +static inline int PageTransHuge(const struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); return PageHead(page); @@ -872,7 +898,7 @@ static inline int PageTransHuge(struct page *page) * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ -static inline int PageTransCompound(struct page *page) +static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } @@ -882,7 +908,7 @@ static inline int PageTransCompound(struct page *page) * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ -static inline int PageTransTail(struct page *page) +static inline int PageTransTail(const struct page *page) { return PageTail(page); } @@ -946,7 +972,7 @@ static inline int page_type_has_type(unsigned int page_type) return (int)page_type < PAGE_MAPCOUNT_RESERVE; } -static inline int page_has_type(struct page *page) +static inline int page_has_type(const struct page *page) { return page_type_has_type(page->page_type); } @@ -1030,7 +1056,7 @@ extern bool is_free_buddy_page(struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); -static __always_inline int PageAnonExclusive(struct page *page) +static __always_inline int PageAnonExclusive(const struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); @@ -1103,19 +1129,18 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * Determine if a page has private stuff, indicating that release routines * should be invoked upon it. */ -static inline int page_has_private(struct page *page) +static inline int page_has_private(const struct page *page) { return !!(page->flags & PAGE_FLAGS_PRIVATE); } -static inline bool folio_has_private(struct folio *folio) +static inline bool folio_has_private(const struct folio *folio) { return page_has_private(&folio->page); } #undef PF_ANY #undef PF_HEAD -#undef PF_ONLY_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c141ea9a95ef..8cd858d912c4 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -4,7 +4,7 @@ #include <linux/atomic.h> #include <linux/cache.h> -#include <linux/kernel.h> +#include <linux/limits.h> #include <asm/page.h> struct page_counter { diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 119a0c9d2a8b..debdc25f08b9 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,8 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, unsigned int nr); +extern void __split_page_owner(struct page *page, int old_order, + int new_order); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); @@ -31,10 +32,11 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, unsigned int nr) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, nr); + __split_page_owner(page, old_order, new_order); } static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { @@ -56,11 +58,11 @@ static inline void reset_page_owner(struct page *page, unsigned short order) { } static inline void set_page_owner(struct page *page, - unsigned int order, gfp_t gfp_mask) + unsigned short order, gfp_t gfp_mask) { } -static inline void split_page_owner(struct page *page, - unsigned short order) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87cc678adc85..fcc06c300a72 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -27,6 +27,7 @@ struct folio; */ struct folio_batch { unsigned char nr; + unsigned char i; bool percpu_pvec_drained; struct folio *folios[PAGEVEC_SIZE]; }; @@ -40,12 +41,14 @@ struct folio_batch { static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; + fbatch->i = 0; fbatch->percpu_pvec_drained = false; } static inline void folio_batch_reinit(struct folio_batch *fbatch) { fbatch->nr = 0; + fbatch->i = 0; } static inline unsigned int folio_batch_count(struct folio_batch *fbatch) @@ -75,6 +78,21 @@ static inline unsigned folio_batch_add(struct folio_batch *fbatch, return folio_batch_space(fbatch); } +/** + * folio_batch_next - Return the next folio to process. + * @fbatch: The folio batch being processed. + * + * Use this function to implement a queue of folios. + * + * Return: The next folio in the queue, or NULL if the queue is empty. + */ +static inline struct folio *folio_batch_next(struct folio_batch *fbatch) +{ + if (fbatch->i == fbatch->nr) + return NULL; + return fbatch->folios[fbatch->i++]; +} + void __folio_batch_release(struct folio_batch *pvec); static inline void folio_batch_release(struct folio_batch *fbatch) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f6d0e3513948..85fc7554cd52 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,15 +212,37 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef set_ptes +#ifndef pte_batch_hint +/** + * pte_batch_hint - Number of pages that can be added to batch without scanning. + * @ptep: Page table pointer for the entry. + * @pte: Page table entry. + * + * Some architectures know that a set of contiguous ptes all map the same + * contiguous memory with the same permissions. In this case, it can provide a + * hint to aid pte batching without the core code needing to scan every pte. + * + * An architecture implementation may ignore the PTE accessed state. Further, + * the dirty state must apply atomically to all the PTEs described by the hint. + * + * May be overridden by the architecture, else pte_batch_hint is always 1. + */ +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + return 1; +} +#endif -#ifndef pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#ifndef pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif +#define pte_next_pfn(pte) pte_advance_pfn(pte, 1) + +#ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. @@ -229,6 +251,10 @@ static inline pte_t pte_next_pfn(pte_t pte) * @pte: Page table entry for the first page. * @nr: Number of pages to map. * + * When nr==1, initial state of pte may be present or not present, and new state + * may be present or not present. When nr>1, initial state of all ptes must be + * not present, and new state must be present. + * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * @@ -580,6 +606,76 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } #endif +#ifndef get_and_clear_full_ptes +/** + * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the + * returned PTE. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = ptep_get_and_clear_full(mm, addr, ptep, full); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} +#endif + +#ifndef clear_full_ptes +/** + * clear_full_ptes - Clear present PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + ptep_get_and_clear_full(mm, addr, ptep, full); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif /* * If two threads concurrently fault at the same page, the thread that @@ -650,6 +746,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef wrprotect_ptes +/** + * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to write-protect. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_set_wrprotect(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + ptep_set_wrprotect(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings @@ -1650,16 +1777,16 @@ typedef unsigned int pgtbl_mod_mask; * Only meaningful when called on a valid entry. */ #ifndef pgd_leaf -#define pgd_leaf(x) 0 +#define pgd_leaf(x) false #endif #ifndef p4d_leaf -#define p4d_leaf(x) 0 +#define p4d_leaf(x) false #endif #ifndef pud_leaf -#define pud_leaf(x) 0 +#define pud_leaf(x) false #endif #ifndef pmd_leaf -#define pmd_leaf(x) 0 +#define pmd_leaf(x) false #endif #ifndef pgd_leaf_size diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 2a3a95586425..8dbd51ea8626 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -18,6 +18,16 @@ struct ptdump_state { const struct ptdump_range *range; }; +bool ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg); void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); +bool ptdump_check_wx(void); + +static inline void debug_checkwx(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_WX)) + ptdump_check_wx(); +} #endif /* _LINUX_PTDUMP_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 17cb0761ff65..94f0e618865b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1261,6 +1261,7 @@ struct task_struct { /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; + u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING @@ -1625,15 +1626,15 @@ extern struct pid *cad_pid; #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ -#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ -#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ -#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ @@ -1643,7 +1644,8 @@ extern struct pid *cad_pid; #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. + * See memalloc_pin_save() */ #define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9a19f1b42f64..7a4066d22883 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -315,7 +315,8 @@ static inline void might_alloc(gfp_t gfp_mask) * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_noio_restore. */ static inline unsigned int memalloc_noio_save(void) { @@ -346,7 +347,8 @@ static inline void memalloc_noio_restore(unsigned int flags) * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_nofs_restore. */ static inline unsigned int memalloc_nofs_save(void) { @@ -368,6 +370,29 @@ static inline void memalloc_nofs_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } +/** + * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. + * + * This function marks the beginning of the __GFP_MEMALLOC allocation scope. + * All further allocations will implicitly add the __GFP_MEMALLOC flag, which + * prevents entering reclaim and allows access to all memory reserves. This + * should only be used when the caller guarantees the allocation will allow more + * memory to be freed very shortly, i.e. it needs to allocate some memory in + * the process of freeing memory, and cannot reclaim due to potential recursion. + * + * Users of this scope have to be extremely careful to not deplete the reserves + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. Usage of a + * pre-allocated pool (e.g. mempool) should be always considered before using + * this scope. + * + * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC + * + * Context: This function should not be used in an interrupt context as that one + * does not give PF_MEMALLOC access to reserves. + * See __gfp_pfmemalloc_flags(). + * Return: The saved flags to be passed to memalloc_noreclaim_restore. + */ static inline unsigned int memalloc_noreclaim_save(void) { unsigned int flags = current->flags & PF_MEMALLOC; @@ -375,11 +400,29 @@ static inline unsigned int memalloc_noreclaim_save(void) return flags; } +/** + * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope. + * @flags: Flags to restore. + * + * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save + * function. Always make sure that the given flags is the return value from the + * pairing memalloc_noreclaim_save call. + */ static inline void memalloc_noreclaim_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +/** + * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. + * + * This function marks the beginning of the ~__GFP_MOVABLE allocation scope. + * All further allocations will implicitly remove the __GFP_MOVABLE flag, which + * will constraint the allocations to zones that allow long term pinning, i.e. + * not ZONE_MOVABLE zones. + * + * Return: The saved flags to be passed to memalloc_pin_restore. + */ static inline unsigned int memalloc_pin_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_PIN; @@ -388,6 +431,14 @@ static inline unsigned int memalloc_pin_save(void) return flags; } +/** + * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope. + * @flags: Flags to restore. + * + * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function. + * Always make sure that the given flags is the return value from the pairing + * memalloc_pin_save call. + */ static inline void memalloc_pin_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index adcbb8f23600..3c6caa5abc7c 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -30,6 +30,53 @@ typedef u32 depot_stack_handle_t; */ #define STACK_DEPOT_EXTRA_BITS 5 +#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) + +#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */ +#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER)) +#define DEPOT_STACK_ALIGN 4 +#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) +#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ + STACK_DEPOT_EXTRA_BITS) + +#ifdef CONFIG_STACKDEPOT +/* Compact structure that stores a reference to a stack. */ +union handle_parts { + depot_stack_handle_t handle; + struct { + /* pool_index is offset by 1 */ + u32 pool_index : DEPOT_POOL_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; + }; +}; + +struct stack_record { + struct list_head hash_list; /* Links in the hash table */ + u32 hash; /* Hash in hash table */ + u32 size; /* Number of stored frames */ + union handle_parts handle; /* Constant after initialization */ + refcount_t count; + union { + unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ + struct { + /* + * An important invariant of the implementation is to + * only place a stack record onto the freelist iff its + * refcount is zero. Because stack records with a zero + * refcount are never considered as valid, it is safe to + * union @entries and freelist management state below. + * Conversely, as soon as an entry is off the freelist + * and its refcount becomes non-zero, the below must not + * be accessed until being placed back on the freelist. + */ + struct list_head free_list; /* Links in the freelist */ + unsigned long rcu_state; /* RCU cookie */ + }; + }; +}; +#endif + typedef u32 depot_flags_t; /* @@ -132,6 +179,17 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); /** + * __stack_depot_get_stack_record - Get a pointer to a stack_record struct + * + * @handle: Stack depot handle + * + * This function is only for internal purposes. + * + * Return: Returns a pointer to a stack_record struct + */ +struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle); + +/** * stack_depot_fetch - Fetch a stack trace from stack depot * * @handle: Stack depot handle returned from stack_depot_save() diff --git a/include/linux/swap.h b/include/linux/swap.h index 95410ca891a7..f53d608daa01 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -437,9 +437,9 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } -extern void free_swap_cache(struct page *page); -extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct encoded_page **, int); +void free_swap_cache(struct folio *folio); +void free_page_and_swap_cache(struct page *); +void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -521,7 +521,7 @@ static inline void put_swap_device(struct swap_info_struct *si) /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ #define free_swap_and_cache(e) is_pfn_swap_entry(e) -static inline void free_swap_cache(struct page *page) +static inline void free_swap_cache(struct folio *folio) { } diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bff1e8d97de0..48b700ba1d18 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -468,6 +468,19 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) return p; } +static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) +{ + struct folio *folio = pfn_folio(swp_offset_pfn(entry)); + + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked + */ + BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); + + return folio; +} + /* * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. They are used to represent unaddressable device memory diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e4056547fbe6..05d59f74fc88 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -36,6 +36,52 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) +/* + * Start with fault_pending_wqh and fault_wqh so they're more likely + * to be in the same cacheline. + * + * Locking order: + * fd_wqh.lock + * fault_pending_wqh.lock + * fault_wqh.lock + * event_wqh.lock + * + * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, + * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's + * also taken in IRQ context. + */ +struct userfaultfd_ctx { + /* waitqueue head for the pending (i.e. not read) userfaults */ + wait_queue_head_t fault_pending_wqh; + /* waitqueue head for the userfaults */ + wait_queue_head_t fault_wqh; + /* waitqueue head for the pseudo fd to wakeup poll/read */ + wait_queue_head_t fd_wqh; + /* waitqueue head for events */ + wait_queue_head_t event_wqh; + /* a refile sequence protected by fault_pending_wqh lock */ + seqcount_spinlock_t refile_seq; + /* pseudo fd refcounting */ + refcount_t refcount; + /* userfaultfd syscall flags */ + unsigned int flags; + /* features requested from the userspace */ + unsigned int features; + /* released */ + bool released; + /* + * Prevents userfaultfd operations (fill/move/wp) from happening while + * some non-cooperative event(s) is taking place. Increments are done + * in write-mode. Whereas, userfaultfd operations, which includes + * reading mmap_changing, is done under read-mode. + */ + struct rw_semaphore map_changing_lock; + /* memory mappings are changing because of non-cooperative event */ + atomic_t mmap_changing; + /* mm with one ore more vmas attached to this userfaultfd_ctx */ + struct mm_struct *mm; +}; + extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); /* A combined operation mode + behavior flags. */ @@ -74,31 +120,26 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd, unsigned long dst_addr, struct page *page, bool newly_allocated, uffd_flags_t flags); -extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, +extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags); -extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, + uffd_flags_t flags); +extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long len, - atomic_t *mmap_changing); -extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags); -extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags); -extern int mwriteprotect_range(struct mm_struct *dst_mm, - unsigned long start, unsigned long len, - bool enable_wp, atomic_t *mmap_changing); + unsigned long len); +extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long len, uffd_flags_t flags); +extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags); +extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp); extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, __u64 flags); +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0f72c85a377b..98ea90e90439 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -258,7 +258,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); /* * Internals. Don't use.. */ -extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h new file mode 100644 index 000000000000..e1dec1a6a749 --- /dev/null +++ b/include/linux/vmcore_info.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VMCORE_INFO_H +#define LINUX_VMCORE_INFO_H + +#include <linux/linkage.h> +#include <linux/elfcore.h> +#include <linux/elf.h> + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +/* + * The per-cpu notes area is a list of notes terminated by a "NULL" + * note header. For kdump, the code in vmcore.c runs in the context + * of the second kernel to combine them into one note. + */ +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES PAGE_SIZE +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; +/* Per cpu memory for storing cpu states in case of system crash. */ +extern note_buf_t __percpu *crash_notes; + +void crash_update_vmcoreinfo_safecopy(void *ptr); +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_BUILD_ID() \ + ({ \ + static_assert(sizeof(vmlinux_build_id) == 20); \ + vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ + }) + +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ARRAY(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_TYPE_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) + +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; +extern u32 *vmcoreinfo_note; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); +#endif /* LINUX_VMCORE_INFO_H */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 453736fd1d23..9845cb62e40b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,6 +11,7 @@ #include <linux/flex_proportions.h> #include <linux/backing-dev-defs.h> #include <linux/blk_types.h> +#include <linux/pagevec.h> struct bio; @@ -40,6 +41,7 @@ enum writeback_sync_modes { * in a manner such that unspecified fields are set to zero. */ struct writeback_control { + /* public fields that can be set and/or consumed by the caller: */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ @@ -77,6 +79,11 @@ struct writeback_control { */ struct swap_iocb **swap_plug; + /* internal fields used by the ->writepages implementation: */ + struct folio_batch fbatch; + pgoff_t index; + int saved_err; + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ @@ -360,11 +367,12 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, bool wb_over_bg_thresh(struct bdi_writeback *wb); +struct folio *writeback_iter(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio, int *error); + typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); -void tag_pages_for_writeback(struct address_space *mapping, - pgoff_t start, pgoff_t end); int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 0b709f5bc65f..341aea490070 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -29,8 +29,8 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); -void zswap_invalidate(int type, pgoff_t offset); -void zswap_swapon(int type); +void zswap_invalidate(swp_entry_t swp); +int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -50,8 +50,11 @@ static inline bool zswap_load(struct folio *folio) return false; } -static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline void zswap_swapon(int type) {} +static inline void zswap_invalidate(swp_entry_t swp) {} +static inline int zswap_swapon(int type, unsigned long nr_pages) +{ + return 0; +} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} |