Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c         |  2
-rw-r--r--  mm/damon/core.c          | 15
-rw-r--r--  mm/damon/lru_sort.c      | 43
-rw-r--r--  mm/damon/reclaim.c       | 18
-rw-r--r--  mm/damon/sysfs-schemes.c |  6
-rw-r--r--  mm/huge_memory.c         | 18
-rw-r--r--  mm/kasan/generic.c       |  3
-rw-r--r--  mm/madvise.c             |  1
-rw-r--r--  mm/memblock.c            |  9
-rw-r--r--  mm/memcontrol.c          | 95
-rw-r--r--  mm/memory-failure.c      |  5
-rw-r--r--  mm/memory.c              | 26
-rw-r--r--  mm/mmap.c                |  6
-rw-r--r--  mm/page-writeback.c      |  4
-rw-r--r--  mm/readahead.c           |  4
-rw-r--r--  mm/swap.h                |  5
-rw-r--r--  mm/swap_state.c          | 10
-rw-r--r--  mm/swapfile.c            | 13
-rw-r--r--  mm/userfaultfd.c         | 29
-rw-r--r--  mm/zswap.c               | 27
20 files changed, 256 insertions(+), 83 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1e3447bccdb1..e039d05304dd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); - wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); if (err) @@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi) INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); + bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); } diff --git a/mm/damon/core.c b/mm/damon/core.c index 36f6f1d21ff0..5b325749fc12 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1026,6 +1026,9 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; + if (c->passed_sample_intervals != s->next_apply_sis) + continue; + if (!s->wmarks.activated) continue; @@ -1176,10 +1179,6 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (c->passed_sample_intervals != s->next_apply_sis) continue; - s->next_apply_sis += - (s->apply_interval_us ? s->apply_interval_us : - c->attrs.aggr_interval) / sample_interval; - if (!s->wmarks.activated) continue; @@ -1195,6 +1194,14 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); } + + damon_for_each_scheme(s, c) { + if (c->passed_sample_intervals != s->next_apply_sis) + continue; + s->next_apply_sis += + (s->apply_interval_us ? s->apply_interval_us : + c->attrs.aggr_interval) / sample_interval; + } } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index f2e5f9431892..3de2916a65c3 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -185,9 +185,21 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } +static void damon_lru_sort_copy_quota_status(struct damos_quota *dst, + struct damos_quota *src) +{ + dst->total_charged_sz = src->total_charged_sz; + dst->total_charged_ns = src->total_charged_ns; + dst->charged_sz = src->charged_sz; + dst->charged_from = src->charged_from; + dst->charge_target_from = src->charge_target_from; + dst->charge_addr_from = src->charge_addr_from; +} + static int damon_lru_sort_apply_parameters(void) { - struct damos *scheme; + struct damos *scheme, *hot_scheme, *cold_scheme; + struct damos *old_hot_scheme = NULL, *old_cold_scheme = NULL; unsigned int hot_thres, cold_thres; int err = 0; @@ -195,18 +207,35 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; + damon_for_each_scheme(scheme, ctx) { + if (!old_hot_scheme) { + old_hot_scheme = scheme; + continue; + } + old_cold_scheme = scheme; + } + hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) * hot_thres_access_freq / 1000; - scheme = damon_lru_sort_new_hot_scheme(hot_thres); - if (!scheme) + hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres); + if (!hot_scheme) return -ENOMEM; - damon_set_schemes(ctx, &scheme, 1); + if (old_hot_scheme) + damon_lru_sort_copy_quota_status(&hot_scheme->quota, + &old_hot_scheme->quota); cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; - scheme = damon_lru_sort_new_cold_scheme(cold_thres); - if (!scheme) + cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres); + if (!cold_scheme) { + 
damon_destroy_scheme(hot_scheme); return -ENOMEM; - damon_add_scheme(ctx, scheme); + } + if (old_cold_scheme) + damon_lru_sort_copy_quota_status(&cold_scheme->quota, + &old_cold_scheme->quota); + + damon_set_schemes(ctx, &hot_scheme, 1); + damon_add_scheme(ctx, cold_scheme); return damon_set_region_biggest_system_ram_default(target, &monitor_region_start, diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index ab974e477d2f..66e190f0374a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_scheme(void) &damon_reclaim_wmarks); } +static void damon_reclaim_copy_quota_status(struct damos_quota *dst, + struct damos_quota *src) +{ + dst->total_charged_sz = src->total_charged_sz; + dst->total_charged_ns = src->total_charged_ns; + dst->charged_sz = src->charged_sz; + dst->charged_from = src->charged_from; + dst->charge_target_from = src->charge_target_from; + dst->charge_addr_from = src->charge_addr_from; +} + static int damon_reclaim_apply_parameters(void) { - struct damos *scheme; + struct damos *scheme, *old_scheme; struct damos_filter *filter; int err = 0; @@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; + if (!list_empty(&ctx->schemes)) { + damon_for_each_scheme(old_scheme, ctx) + damon_reclaim_copy_quota_status(&scheme->quota, + &old_scheme->quota); + } if (skip_anon) { filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); if (!filter) { diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 8dbaac6e5c2d..ae0f0b314f3a 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1905,6 +1905,10 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; + /* user could have removed the scheme sysfs dir */ + if (i >= sysfs_schemes->nr) + break; + sysfs_scheme = sysfs_schemes->schemes_arr[i]; damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals, &scheme->quota); @@ -2194,7 +2198,7 @@ static void damos_tried_regions_init_upd_status( sysfs_regions->upd_timeout_jiffies = jiffies + 2 * usecs_to_jiffies(scheme->apply_interval_us ? 
scheme->apply_interval_us : - ctx->attrs.sample_interval); + ctx->attrs.aggr_interval); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 94ef5c02b459..94c958f7ebb5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,6 +37,7 @@ #include <linux/page_owner.h> #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> +#include <linux/compat.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -809,7 +810,10 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, { loff_t off_end = off + len; loff_t off_align = round_up(off, size); - unsigned long len_pad, ret; + unsigned long len_pad, ret, off_sub; + + if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) + return 0; if (off_end <= off_align || (off_end - off_align) < size) return 0; @@ -835,7 +839,13 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (ret == addr) return addr; - ret += (off - ret) & (size - 1); + off_sub = (off - ret) & (size - 1); + + if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown && + !off_sub) + return ret + size; + + ret += off_sub; return ret; } @@ -2437,7 +2447,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(old_pmd); folio = page_folio(page); if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) - folio_set_dirty(folio); + folio_mark_dirty(folio); if (!folio_test_referenced(folio) && pmd_young(old_pmd)) folio_set_referenced(folio); folio_remove_rmap_pmd(folio, page, vma); @@ -3563,7 +3573,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, } if (pmd_dirty(pmdval)) - folio_set_dirty(folio); + folio_mark_dirty(folio); if (pmd_write(pmdval)) entry = make_writable_migration_entry(page_to_pfn(page)); else if (anon_exclusive) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index df6627f62402..032bf3e98c24 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -522,6 +522,9 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { + if (!kasan_arch_is_ready()) + return; + /* Check if free meta is valid. 
*/ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; diff --git a/mm/madvise.c b/mm/madvise.c index 912155a94ed5..cfa5e7288261 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -429,6 +429,7 @@ restart: if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; diff --git a/mm/memblock.c b/mm/memblock.c index abd92869874d..d09136e040d3 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -180,8 +180,9 @@ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) /* * Address comparison utilities */ -static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, - phys_addr_t base2, phys_addr_t size2) +unsigned long __init_memblock +memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, + phys_addr_t size2) { return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } @@ -2176,6 +2177,9 @@ static void __init memmap_init_reserved_pages(void) start = region->base; end = start + region->size; + if (nid == NUMA_NO_NODE || nid >= MAX_NUMNODES) + nid = early_pfn_to_nid(PFN_DOWN(start)); + reserve_bootmem_region(start, end, nid); } } @@ -2246,6 +2250,7 @@ static const char * const flagname[] = { [ilog2(MEMBLOCK_MIRROR)] = "MIRROR", [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", + [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", }; static int memblock_debug_show(struct seq_file *m, void *private) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e4c8735e7c85..61932c9215e7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -621,6 +621,15 @@ static inline int memcg_events_index(enum vm_event_item idx) } struct memcg_vmstats_percpu { + /* Stats updates since the last flush */ + unsigned int stats_updates; + + /* Cached pointers for fast iteration in memcg_rstat_updated() */ + struct memcg_vmstats_percpu *parent; + struct memcg_vmstats *vmstats; + + /* The above should fit a single cacheline for memcg_rstat_updated() */ + /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; unsigned long events[NR_MEMCG_EVENTS]; @@ -632,10 +641,7 @@ struct memcg_vmstats_percpu { /* Cgroup1: threshold notifications & softlimit tree updates */ unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; - - /* Stats updates since the last flush */ - unsigned int stats_updates; -}; +} ____cacheline_aligned; struct memcg_vmstats { /* Aggregated (CPU and subtree) page state & events */ @@ -698,36 +704,35 @@ static void memcg_stats_unlock(void) } -static bool memcg_should_flush_stats(struct mem_cgroup *memcg) +static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { - return atomic64_read(&memcg->vmstats->stats_updates) > + return atomic64_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + struct memcg_vmstats_percpu *statc; int cpu = smp_processor_id(); - unsigned int x; if (!val) return; cgroup_rstat_updated(memcg->css.cgroup, cpu); - - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates, - abs(val)); - - if (x < MEMCG_CHARGE_BATCH) + statc = this_cpu_ptr(memcg->vmstats_percpu); + for (; statc; statc = statc->parent) { + statc->stats_updates += abs(val); + if (statc->stats_updates < MEMCG_CHARGE_BATCH) continue; /* * If @memcg is already flush-able, increasing 
stats_updates is * redundant. Avoid the overhead of the atomic update. */ - if (!memcg_should_flush_stats(memcg)) - atomic64_add(x, &memcg->vmstats->stats_updates); - __this_cpu_write(memcg->vmstats_percpu->stats_updates, 0); + if (!memcg_vmstats_needs_flush(statc->vmstats)) + atomic64_add(statc->stats_updates, + &statc->vmstats->stats_updates); + statc->stats_updates = 0; } } @@ -756,7 +761,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg) if (!memcg) memcg = root_mem_cgroup; - if (memcg_should_flush_stats(memcg)) + if (memcg_vmstats_needs_flush(memcg->vmstats)) do_flush_stats(memcg); } @@ -770,7 +775,7 @@ void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) static void flush_memcg_stats_dwork(struct work_struct *w) { /* - * Deliberately ignore memcg_should_flush_stats() here so that flushing + * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing * in latency-sensitive paths is as cheap as possible. */ do_flush_stats(root_mem_cgroup); @@ -2623,8 +2628,9 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, } /* - * Scheduled by try_charge() to be executed from the userland return path - * and reclaims memory over the high limit. + * Reclaims memory over the high limit. Called directly from + * try_charge() (context permitting), as well as from the userland + * return path where reclaim is always able to block. */ void mem_cgroup_handle_over_high(gfp_t gfp_mask) { @@ -2644,6 +2650,17 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) retry_reclaim: /* + * Bail if the task is already exiting. Unlike memory.max, + * memory.high enforcement isn't as strict, and there is no + * OOM killer involved, which means the excess could already + * be much bigger (and still growing) than it could for + * memory.max; the dying task could get stuck in fruitless + * reclaim for a long time, which isn't desirable. + */ + if (task_is_dying()) + goto out; + + /* * The allocating task should reclaim at least the batch size, but for * subsequent retries we only want to do what's necessary to prevent oom * or breaching resource isolation. @@ -2693,6 +2710,9 @@ retry_reclaim: } /* + * Reclaim didn't manage to push usage below the limit, slow + * this allocating task down. + * * If we exit early, we're guaranteed to die (since * schedule_timeout_killable sets TASK_KILLABLE). This means we don't * need to account for any ill-begotten jiffies to pay them off later. @@ -2887,11 +2907,17 @@ done_restock: } } while ((memcg = parent_mem_cgroup(memcg))); + /* + * Reclaim is set up above to be called from the userland + * return path. But also attempt synchronous reclaim to avoid + * excessive overrun while the task is still inside the + * kernel. If this is successful, the return path will see it + * when it rechecks the overage and simply bail out. 
+ */ if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && - gfpflags_allow_blocking(gfp_mask)) { + gfpflags_allow_blocking(gfp_mask)) mem_cgroup_handle_over_high(gfp_mask); - } return 0; } @@ -5456,10 +5482,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) __mem_cgroup_free(memcg); } -static struct mem_cgroup *mem_cgroup_alloc(void) +static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) { + struct memcg_vmstats_percpu *statc, *pstatc; struct mem_cgroup *memcg; - int node; + int node, cpu; int __maybe_unused i; long error = -ENOMEM; @@ -5483,6 +5510,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg->vmstats_percpu) goto fail; + for_each_possible_cpu(cpu) { + if (parent) + pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); + statc->parent = parent ? pstatc : NULL; + statc->vmstats = memcg->vmstats; + } + for_each_node(node) if (alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; @@ -5528,7 +5563,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) struct mem_cgroup *memcg, *old_memcg; old_memcg = set_active_memcg(parent); - memcg = mem_cgroup_alloc(); + memcg = mem_cgroup_alloc(parent); set_active_memcg(old_memcg); if (IS_ERR(memcg)) return ERR_CAST(memcg); @@ -7936,9 +7971,13 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - pr_warn_once("The swapaccount= commandline option is deprecated. " - "Please report your usecase to linux-mm@kvack.org if you " - "depend on this functionality.\n"); + bool res; + + if (!kstrtobool(s, &res) && !res) + pr_warn_once("The swapaccount=0 commandline option is deprecated " + "in favor of configuring swap control via cgroupfs. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); return 1; } __setup("swapaccount=", setup_swap_account); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4f9b61f4a668..9349948f1abf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -982,7 +982,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p, int count = page_count(p) - 1; if (extra_pins) - count -= 1; + count -= folio_nr_pages(page_folio(p)); if (count > 0) { pr_err("%#lx: %s still referenced by %d users\n", @@ -1377,6 +1377,9 @@ void ClearPageHWPoisonTakenOff(struct page *page) */ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) { + if (PageSlab(page)) + return false; + /* Soft offline could migrate non-LRU movable pages */ if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) return true; diff --git a/mm/memory.c b/mm/memory.c index 7e1f4849463a..0bfc8b007c01 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1464,7 +1464,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, delay_rmap = 0; if (!folio_test_anon(folio)) { if (pte_dirty(ptent)) { - folio_set_dirty(folio); + folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { delay_rmap = 1; force_flush = 1; @@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; + bool need_clear_cache = false; bool exclusive = false; swp_entry_t entry; pte_t pte; @@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { + /* + * Prevent parallel swapin from proceeding with + * the cache flag. 
Otherwise, another thread may + * finish swapin first, free the entry, and swapout + * reusing the same entry. It's undetectable as + * pte_same() returns true due to entry reuse. + */ + if (swapcache_prepare(entry)) { + /* Relax a bit to prevent rapid repeated page faults */ + schedule_timeout_uninterruptible(1); + goto out; + } + need_clear_cache = true; + /* skip swapcache */ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false); @@ -4117,6 +4132,9 @@ unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: + /* Clear the swap cache pin for direct swapin after PTL unlock */ + if (need_clear_cache) + swapcache_clear(si, entry); if (si) put_swap_device(si); return ret; @@ -4131,6 +4149,8 @@ out_release: folio_unlock(swapcache); folio_put(swapcache); } + if (need_clear_cache) + swapcache_clear(si, entry); if (si) put_swap_device(si); return ret; @@ -5478,7 +5498,7 @@ static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs return true; if (regs && !user_mode(regs)) { - unsigned long ip = instruction_pointer(regs); + unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } @@ -5503,7 +5523,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r { mmap_read_unlock(mm); if (regs && !user_mode(regs)) { - unsigned long ip = instruction_pointer(regs); + unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } diff --git a/mm/mmap.c b/mm/mmap.c index b78e83d351d2..d89770eaab6b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1825,15 +1825,17 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. - * do_mmap() will clear pgoff, so match alignment. */ - pgoff = 0; get_area = shmem_get_unmapped_area; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { /* Ensures that larger anonymous mappings are THP aligned. */ get_area = thp_get_unmapped_area; } + /* Always treat pgoff as zero for anonymous memory. */ + if (!file) + pgoff = 0; + addr = get_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) return addr; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cd4e4ae77c40..3f255534986a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1638,7 +1638,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) */ dtc->wb_thresh = __wb_calc_thresh(dtc); dtc->wb_bg_thresh = dtc->thresh ? - div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; + div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need @@ -1921,7 +1921,7 @@ pause: break; } __set_current_state(TASK_KILLABLE); - wb->dirty_sleep = now; + bdi->last_bdp_sleep = jiffies; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; diff --git a/mm/readahead.c b/mm/readahead.c index 23620c57c122..2648ec4f0494 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -469,7 +469,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, if (!folio) return -ENOMEM; - mark = round_up(mark, 1UL << order); + mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); @@ -575,7 +575,7 @@ static void ondemand_readahead(struct readahead_control *ractl, * It's the expected callback index, assume sequential access. 
* Ramp up sizes, and push forward the readahead window. */ - expected = round_up(ra->start + ra->size - ra->async_size, + expected = round_down(ra->start + ra->size - ra->async_size, 1UL << order); if (index == expected || index == (ra->start + ra->size)) { ra->start += ra->size; diff --git a/mm/swap.h b/mm/swap.h index 758c46ca671e..fc2f6ade7f80 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); struct folio *filemap_get_incore_folio(struct address_space *mapping, @@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } +static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +{ +} + static inline struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/swap_state.c b/mm/swap_state.c index e671266ad772..7255c01a1e4e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -680,9 +680,10 @@ skip: /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (unlikely(page_allocated)) + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); - zswap_folio_swapin(folio); + } return folio; } @@ -855,9 +856,10 @@ skip: /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); - if (unlikely(page_allocated)) + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); - zswap_folio_swapin(folio); + } return folio; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 556ff7347d5f..746aa9da5302 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + unsigned char usage; + + ci = lock_cluster_or_swap_info(si, offset); + usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); + unlock_cluster_or_swap_info(si, ci); + if (!usage) + free_swap_slot(entry); +} + struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return swap_type_to_swap_info(swp_type(entry)); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 20e3b0d9cf7e..7cf7d4384259 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -357,6 +357,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( unsigned long dst_start, unsigned long src_start, unsigned long len, + atomic_t *mmap_changing, uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; @@ -472,6 +473,15 @@ retry: goto out; } mmap_read_lock(dst_mm); + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. 
mremap) running in parallel, bail out and + * request the user to retry later + */ + if (mmap_changing && atomic_read(mmap_changing)) { + err = -EAGAIN; + break; + } dst_vma = NULL; goto retry; @@ -506,6 +516,7 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, + atomic_t *mmap_changing, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ @@ -622,8 +633,8 @@ retry: * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(dst_vma, dst_start, - src_start, len, flags); + return mfill_atomic_hugetlb(dst_vma, dst_start, src_start, + len, mmap_changing, flags); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; @@ -891,8 +902,8 @@ static int move_present_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(*src_pte, orig_src_pte) || - !pte_same(*dst_pte, orig_dst_pte)) { + if (!pte_same(ptep_get(src_pte), orig_src_pte) || + !pte_same(ptep_get(dst_pte), orig_dst_pte)) { err = -EAGAIN; goto out; } @@ -935,8 +946,8 @@ static int move_swap_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(*src_pte, orig_src_pte) || - !pte_same(*dst_pte, orig_dst_pte)) { + if (!pte_same(ptep_get(src_pte), orig_src_pte) || + !pte_same(ptep_get(dst_pte), orig_dst_pte)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1005,7 +1016,7 @@ retry: } spin_lock(dst_ptl); - orig_dst_pte = *dst_pte; + orig_dst_pte = ptep_get(dst_pte); spin_unlock(dst_ptl); if (!pte_none(orig_dst_pte)) { err = -EEXIST; @@ -1013,7 +1024,7 @@ retry: } spin_lock(src_ptl); - orig_src_pte = *src_pte; + orig_src_pte = ptep_get(src_pte); spin_unlock(src_ptl); if (pte_none(orig_src_pte)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) @@ -1043,7 +1054,7 @@ retry: * page isn't freed under us */ spin_lock(src_ptl); - if (!pte_same(orig_src_pte, *src_pte)) { + if (!pte_same(orig_src_pte, ptep_get(src_pte))) { spin_unlock(src_ptl); err = -EAGAIN; goto out; diff --git a/mm/zswap.c b/mm/zswap.c index ca25b676048e..db4625af65fb 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -377,10 +377,9 @@ void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; - if (folio) { - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - } + VM_WARN_ON_ONCE(!folio_test_locked(folio)); + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } /********************************* @@ -536,10 +535,6 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { - if (entry->objcg) { - obj_cgroup_uncharge_zswap(entry->objcg, entry->length); - obj_cgroup_put(entry->objcg); - } if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { @@ -548,6 +543,10 @@ static void zswap_free_entry(struct zswap_entry *entry) atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } + if (entry->objcg) { + obj_cgroup_uncharge_zswap(entry->objcg, entry->length); + obj_cgroup_put(entry->objcg); + } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); @@ -895,10 +894,8 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). 
*/ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) { - ret = LRU_SKIP; + if (writeback_result == -EEXIST && encountered_page_in_swapcache) *encountered_page_in_swapcache = true; - } goto put_unlock; } @@ -1442,6 +1439,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); return -ENOMEM; } spin_unlock(&tree->lock); @@ -1519,7 +1518,7 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!zswap_enabled || !tree) + if (!tree) return false; /* @@ -1534,6 +1533,10 @@ bool zswap_store(struct folio *folio) zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); + + if (!zswap_enabled) + return false; + objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); |
