diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/hugetlb.c | 10 | ||||
-rw-r--r-- | mm/kasan/quarantine.c | 7 | ||||
-rw-r--r-- | mm/memcontrol.c | 12 | ||||
-rw-r--r-- | mm/memory-failure.c | 158 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 14 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 54 | ||||
-rw-r--r-- | mm/page_alloc.c | 2 | ||||
-rw-r--r-- | mm/userfaultfd.c | 15 | ||||
-rw-r--r-- | mm/util.c | 11 | ||||
-rw-r--r-- | mm/vmalloc.c | 36 | ||||
-rw-r--r-- | mm/workingset.c | 2 |
13 files changed, 240 insertions, 91 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f8ca7cca3c1a..3fc721789743 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6785,6 +6785,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return ret; } +int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + int ret; + + spin_lock_irq(&hugetlb_lock); + ret = __get_huge_page_for_hwpoison(pfn, flags); + spin_unlock_irq(&hugetlb_lock); + return ret; +} + void putback_active_hugepage(struct page *page) { spin_lock_irq(&hugetlb_lock); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 08291ed33e93..0a9def8ce5e8 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -315,6 +315,13 @@ static void per_cpu_remove_cache(void *arg) struct qlist_head *q; q = this_cpu_ptr(&cpu_quarantine); + /* + * Ensure the ordering between the writing to q->offline and + * per_cpu_remove_cache. Prevent cpu_quarantine from being corrupted + * by interrupt. + */ + if (READ_ONCE(q->offline)) + return; qlist_move_cache(q, &to_free, cache); qlist_free_all(&to_free, cache); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 725f76723220..598fece89e2b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -587,6 +587,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +static u64 flush_next_time; + +#define FLUSH_TIME (2UL*HZ) /* * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can @@ -637,6 +640,7 @@ static void __mem_cgroup_flush_stats(void) if (!spin_trylock_irqsave(&stats_flush_lock, flag)) return; + flush_next_time = jiffies_64 + 2*FLUSH_TIME; cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); spin_unlock_irqrestore(&stats_flush_lock, flag); @@ -648,10 +652,16 @@ void mem_cgroup_flush_stats(void) __mem_cgroup_flush_stats(); } +void mem_cgroup_flush_stats_delayed(void) +{ + if (time_after64(jiffies_64, flush_next_time)) + mem_cgroup_flush_stats(); +} + static void flush_memcg_stats_dwork(struct work_struct *w) { __mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } /** diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dcb6bb9cf731..27760c19bad7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1498,50 +1498,113 @@ static int try_to_split_thp_page(struct page *page, const char *msg) return 0; } -static int memory_failure_hugetlb(unsigned long pfn, int flags) +/* + * Called from hugetlb code with hugetlb_lock held. + * + * Return values: + * 0 - free hugepage + * 1 - in-use hugepage + * 2 - not a hugepage + * -EBUSY - the hugepage is busy (try to retry) + * -EHWPOISON - the hugepage is already hwpoisoned + */ +int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + struct page *page = pfn_to_page(pfn); + struct page *head = compound_head(page); + int ret = 2; /* fallback to normal page handling */ + bool count_increased = false; + + if (!PageHeadHuge(head)) + goto out; + + if (flags & MF_COUNT_INCREASED) { + ret = 1; + count_increased = true; + } else if (HPageFreed(head) || HPageMigratable(head)) { + ret = get_page_unless_zero(head); + if (ret) + count_increased = true; + } else { + ret = -EBUSY; + goto out; + } + + if (TestSetPageHWPoison(head)) { + ret = -EHWPOISON; + goto out; + } + + return ret; +out: + if (count_increased) + put_page(head); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Taking refcount of hugetlb pages needs extra care about race conditions + * with basic operations like hugepage allocation/free/demotion. + * So some of prechecks for hwpoison (pinning, and testing/setting + * PageHWPoison) should be done in single hugetlb_lock range. + */ +static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) { - struct page *p = pfn_to_page(pfn); - struct page *head = compound_head(p); int res; + struct page *p = pfn_to_page(pfn); + struct page *head; unsigned long page_flags; + bool retry = true; - if (TestSetPageHWPoison(head)) { - pr_err("Memory failure: %#lx: already hardware poisoned\n", - pfn); - res = -EHWPOISON; - if (flags & MF_ACTION_REQUIRED) + *hugetlb = 1; +retry: + res = get_huge_page_for_hwpoison(pfn, flags); + if (res == 2) { /* fallback to normal page handling */ + *hugetlb = 0; + return 0; + } else if (res == -EHWPOISON) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); + if (flags & MF_ACTION_REQUIRED) { + head = compound_head(p); res = kill_accessing_process(current, page_to_pfn(head), flags); + } + return res; + } else if (res == -EBUSY) { + if (retry) { + retry = false; + goto retry; + } + action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); return res; } + head = compound_head(p); + lock_page(head); + + if (hwpoison_filter(p)) { + ClearPageHWPoison(head); + res = -EOPNOTSUPP; + goto out; + } + num_poisoned_pages_inc(); - if (!(flags & MF_COUNT_INCREASED)) { - res = get_hwpoison_page(p, flags); - if (!res) { - lock_page(head); - if (hwpoison_filter(p)) { - if (TestClearPageHWPoison(head)) - num_poisoned_pages_dec(); - unlock_page(head); - return -EOPNOTSUPP; - } - unlock_page(head); - res = MF_FAILED; - if (__page_handle_poison(p)) { - page_ref_inc(p); - res = MF_RECOVERED; - } - action_result(pfn, MF_MSG_FREE_HUGE, res); - return res == MF_RECOVERED ? 0 : -EBUSY; - } else if (res < 0) { - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - return -EBUSY; + /* + * Handling free hugepage. The possible race with hugepage allocation + * or demotion can be prevented by PageHWPoison flag. + */ + if (res == 0) { + unlock_page(head); + res = MF_FAILED; + if (__page_handle_poison(p)) { + page_ref_inc(p); + res = MF_RECOVERED; } + action_result(pfn, MF_MSG_FREE_HUGE, res); + return res == MF_RECOVERED ? 0 : -EBUSY; } - lock_page(head); - /* * The page could have changed compound pages due to race window. * If this happens just bail out. @@ -1554,14 +1617,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) page_flags = head->flags; - if (hwpoison_filter(p)) { - if (TestClearPageHWPoison(head)) - num_poisoned_pages_dec(); - put_page(p); - res = -EOPNOTSUPP; - goto out; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@ -1588,6 +1643,12 @@ out: unlock_page(head); return res; } +#else +static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) +{ + return 0; +} +#endif static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) @@ -1712,6 +1773,7 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; + int hugetlb = 0; if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1739,10 +1801,9 @@ int memory_failure(unsigned long pfn, int flags) } try_again: - if (PageHuge(p)) { - res = memory_failure_hugetlb(pfn, flags); + res = try_memory_failure_hugetlb(pfn, flags, &hugetlb); + if (hugetlb) goto unlock_mutex; - } if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", @@ -1800,6 +1861,19 @@ try_again: if (PageTransHuge(hpage)) { /* + * Bail out before SetPageHasHWPoisoned() if hpage is + * huge_zero_page, although PG_has_hwpoisoned is not + * checked in set_huge_zero_page(). + * + * TODO: Handle memory failure of huge_zero_page thoroughly. + */ + if (is_huge_zero_page(hpage)) { + action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); + res = -EBUSY; + goto unlock_mutex; + } + + /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. * And the flag can't be set in get_hwpoison_page() since diff --git a/mm/mmap.c b/mm/mmap.c index 3aa839f81e63..313b57d55a63 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2117,14 +2117,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 459d195d2ff6..f45ff1b7626a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked( } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); +static bool +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, + unsigned long seq) +{ + bool ret; + + spin_lock(&subscriptions->lock); + ret = subscriptions->invalidate_seq != seq; + spin_unlock(&subscriptions->lock); + return ret; +} + /** * mmu_interval_notifier_remove - Remove a interval notifier * @interval_sub: Interval subscription to unregister @@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) wait_event(subscriptions->wq, - READ_ONCE(subscriptions->invalidate_seq) != seq); + mmu_interval_seq_released(subscriptions, seq)); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); diff --git a/mm/nommu.c b/mm/nommu.c index 55a9e48a7a02..9d7afc2d959e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -226,6 +226,8 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc); + /* * vzalloc - allocate virtually contiguous memory with zero fill * diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7ec38194f8e1..49d7df39b02d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -632,7 +632,7 @@ done: */ set_bit(MMF_OOM_SKIP, &mm->flags); - /* Drop a reference taken by wake_oom_reaper */ + /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); } @@ -644,12 +644,12 @@ static int oom_reaper(void *unused) struct task_struct *tsk = NULL; wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); - spin_lock(&oom_reaper_lock); + spin_lock_irq(&oom_reaper_lock); if (oom_reaper_list != NULL) { tsk = oom_reaper_list; oom_reaper_list = tsk->oom_reaper_list; } - spin_unlock(&oom_reaper_lock); + spin_unlock_irq(&oom_reaper_lock); if (tsk) oom_reap_task(tsk); @@ -658,22 +658,48 @@ static int oom_reaper(void *unused) return 0; } -static void wake_oom_reaper(struct task_struct *tsk) +static void wake_oom_reaper(struct timer_list *timer) { - /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) - return; + struct task_struct *tsk = container_of(timer, struct task_struct, + oom_reaper_timer); + struct mm_struct *mm = tsk->signal->oom_mm; + unsigned long flags; - get_task_struct(tsk); + /* The victim managed to terminate on its own - see exit_mmap */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + put_task_struct(tsk); + return; + } - spin_lock(&oom_reaper_lock); + spin_lock_irqsave(&oom_reaper_lock, flags); tsk->oom_reaper_list = oom_reaper_list; oom_reaper_list = tsk; - spin_unlock(&oom_reaper_lock); + spin_unlock_irqrestore(&oom_reaper_lock, flags); trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait); } +/* + * Give the OOM victim time to exit naturally before invoking the oom_reaping. + * The timers timeout is arbitrary... the longer it is, the longer the worst + * case scenario for the OOM can take. If it is too small, the oom_reaper can + * get in the way and release resources needed by the process exit path. + * e.g. The futex robust list can sit in Anon|Private memory that gets reaped + * before the exit path is able to wake the futex waiters. + */ +#define OOM_REAPER_DELAY (2*HZ) +static void queue_oom_reaper(struct task_struct *tsk) +{ + /* mm is already queued? */ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + return; + + get_task_struct(tsk); + timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); + tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; + add_timer(&tsk->oom_reaper_timer); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -681,7 +707,7 @@ static int __init oom_init(void) } subsys_initcall(oom_init) #else -static inline void wake_oom_reaper(struct task_struct *tsk) +static inline void queue_oom_reaper(struct task_struct *tsk) { } #endif /* CONFIG_MMU */ @@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) rcu_read_unlock(); if (can_oom_reap) - wake_oom_reaper(victim); + queue_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); @@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) task_lock(victim); if (task_will_free_mem(victim)) { mark_oom_victim(victim); - wake_oom_reaper(victim); + queue_oom_reaper(victim); task_unlock(victim); put_task_struct(victim); return; @@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc) */ if (task_will_free_mem(current)) { mark_oom_victim(current); - wake_oom_reaper(current); + queue_oom_reaper(current); return true; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 33ca8cab21e6..0e42038382c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8919,7 +8919,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags); + table = vmalloc_huge(size, gfp_flags); virt = true; if (table) huge = is_vm_area_hugepages(table); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0cb8e5ef1713..e9bb6db002aa 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -72,12 +72,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - if (writable) { - if (wp_copy) - _dst_pte = pte_mkuffd_wp(_dst_pte); - else - _dst_pte = pte_mkwrite(_dst_pte); - } + + /* + * Always mark a PTE as write-protected when needed, regardless of + * VM_WRITE, which the user might change. + */ + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); + else if (writable) + _dst_pte = pte_mkwrite(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); diff --git a/mm/util.c b/mm/util.c index 54e5e761a9a9..3492a9e81aa3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -592,8 +592,15 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) return NULL; } - return __vmalloc_node(size, 1, flags, node, - __builtin_return_address(0)); + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 07da85ae825b..cadfbb5155ea 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2653,15 +2653,18 @@ static void __vunmap(const void *addr, int deallocate_pages) vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { - unsigned int page_order = vm_area_page_order(area); - int i, step = 1U << page_order; + int i; - for (i = 0; i < area->nr_pages; i += step) { + for (i = 0; i < area->nr_pages; i++) { struct page *page = area->pages[i]; BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -step); - __free_pages(page, page_order); + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + /* + * High-order allocs for huge vmallocs are split, so + * can be freed as an array of order-0 allocations + */ + __free_pages(page, 0); cond_resched(); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2914,12 +2917,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } - } else - /* - * Compound pages required for remap_vmalloc_page if - * high-order pages. - */ - gfp |= __GFP_COMP; + } /* High-order pages or fallback path if "bulk" fails. */ @@ -2933,6 +2931,15 @@ vm_area_alloc_pages(gfp_t gfp, int nid, page = alloc_pages_node(nid, gfp, order); if (unlikely(!page)) break; + /* + * Higher order allocations must be able to be treated as + * indepdenent small pages by callers (as they can with + * small-page vmallocs). Some drivers do their own refcounting + * on vmalloc_to_page() pages, some use page->mapping, + * page->lru, etc. + */ + if (order) + split_page(page, order); /* * Careful, we allocate and map page-order pages, but @@ -2992,11 +2999,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (gfp_mask & __GFP_ACCOUNT) { - int i, step = 1U << page_order; + int i; - for (i = 0; i < area->nr_pages; i += step) - mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, - step); + for (i = 0; i < area->nr_pages; i++) + mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); } /* diff --git a/mm/workingset.c b/mm/workingset.c index 8a3828acc0bf..592569a8974c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -355,7 +355,7 @@ void workingset_refault(struct folio *folio, void *shadow) mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats_delayed(); /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if |