diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 1 | ||||
-rw-r--r-- | mm/backing-dev.c | 3 | ||||
-rw-r--r-- | mm/filemap.c | 9 | ||||
-rw-r--r-- | mm/gup.c | 3 | ||||
-rw-r--r-- | mm/huge_memory.c | 5 | ||||
-rw-r--r-- | mm/memcontrol.c | 2 | ||||
-rw-r--r-- | mm/migrate.c | 20 | ||||
-rw-r--r-- | mm/mmap.c | 87 | ||||
-rw-r--r-- | mm/oom_kill.c | 81 | ||||
-rw-r--r-- | mm/page-writeback.c | 18 | ||||
-rw-r--r-- | mm/rmap.c | 3 | ||||
-rw-r--r-- | mm/sparse.c | 2 | ||||
-rw-r--r-- | mm/vmscan.c | 21 | ||||
-rw-r--r-- | mm/vmstat.c | 6 | ||||
-rw-r--r-- | mm/z3fold.c | 42 |
15 files changed, 208 insertions, 95 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index d5004d82a1d6..e14c01513bfd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -636,6 +636,7 @@ config DEFERRED_STRUCT_PAGE_INIT default n depends on NO_BOOTMEM depends on !FLATMEM + depends on !NEED_PER_CPU_KM help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 023190c69dce..7441bd93b732 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -115,6 +115,7 @@ static int bdi_debug_register(struct backing_dev_info *bdi, const char *name) bdi, &bdi_debug_stats_fops); if (!bdi->debug_stats) { debugfs_remove(bdi->debug_dir); + bdi->debug_dir = NULL; return -ENOMEM; } @@ -383,7 +384,7 @@ static void wb_shutdown(struct bdi_writeback *wb) * the barrier provided by test_and_clear_bit() above. */ smp_wmb(); - clear_bit(WB_shutting_down, &wb->state); + clear_and_wake_up_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) diff --git a/mm/filemap.c b/mm/filemap.c index 9276bdb2343c..0604cb02e6f3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -786,7 +786,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); @@ -842,7 +842,7 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); if (error) { if (!huge) mem_cgroup_cancel_charge(page, memcg, false); @@ -1585,8 +1585,7 @@ no_page: if (fgp_flags & FGP_ACCESSED) __SetPageReferenced(page); - err = add_to_page_cache_lru(page, mapping, offset, - gfp_mask & GFP_RECLAIM_MASK); + err = add_to_page_cache_lru(page, mapping, offset, gfp_mask); if (unlikely(err)) { put_page(page); page = NULL; @@ -2387,7 +2386,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); + ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -544,6 +544,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; + if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) + return -EFAULT; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 14ed6ee5e02f..a3a1815f8e11 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2925,7 +2925,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = maybe_pmd_mkwrite(pmde, vma); flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); - page_add_anon_rmap(new, vma, mmun_start, true); + if (PageAnon(new)) + page_add_anon_rmap(new, vma, mmun_start, true); + else + page_add_file_rmap(new, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); if (vma->vm_flags & VM_LOCKED) mlock_vma_page(new); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e074f7c637aa..2bd3df3d101a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2192,7 +2192,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, { struct memcg_kmem_cache_create_work *cw; - cw = kmalloc(sizeof(*cw), GFP_NOWAIT); + cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); if (!cw) return; diff --git a/mm/migrate.c b/mm/migrate.c index f65dd69e1fd1..8c0af0f7cab1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -472,7 +472,7 @@ int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); - expected_count += 1 + page_has_private(page); + expected_count += hpage_nr_pages(page) + page_has_private(page); if (page_count(page) != expected_count || radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { @@ -505,7 +505,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ newpage->index = page->index; newpage->mapping = page->mapping; - get_page(newpage); /* add cache reference */ + page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */ if (PageSwapBacked(page)) { __SetPageSwapBacked(newpage); if (PageSwapCache(page)) { @@ -524,13 +524,24 @@ int migrate_page_move_mapping(struct address_space *mapping, } radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + if (PageTransHuge(page)) { + int i; + int index = page_index(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + pslot = radix_tree_lookup_slot(&mapping->i_pages, + index + i); + radix_tree_replace_slot(&mapping->i_pages, pslot, + newpage + i); + } + } /* * Drop cache reference from old page by unfreezing * to one less reference. * We know this isn't the last reference. */ - page_ref_unfreeze(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ @@ -1622,6 +1633,9 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, current_node = NUMA_NO_NODE; } out_flush: + if (list_empty(&pagelist)) + return err; + /* Make sure we do not overwrite the existing error */ err1 = do_move_pages_to_node(mm, &pagelist, current_node); if (!err1) diff --git a/mm/mmap.c b/mm/mmap.c index 188f195883b9..78e14facdb6e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -100,11 +100,20 @@ pgprot_t protection_map[16] __ro_after_init = { __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; +#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT +static inline pgprot_t arch_filter_pgprot(pgprot_t prot) +{ + return prot; +} +#endif + pgprot_t vm_get_page_prot(unsigned long vm_flags) { - return __pgprot(pgprot_val(protection_map[vm_flags & + pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | pgprot_val(arch_vm_get_page_prot(vm_flags))); + + return arch_filter_pgprot(ret); } EXPORT_SYMBOL(vm_get_page_prot); @@ -1315,6 +1324,35 @@ static inline int mlock_future_check(struct mm_struct *mm, return 0; } +static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return inode->i_sb->s_maxbytes; + + if (S_ISBLK(inode->i_mode)) + return MAX_LFS_FILESIZE; + + /* Special "we do even unsigned file positions" case */ + if (file->f_mode & FMODE_UNSIGNED_OFFSET) + return 0; + + /* Yes, random drivers might want more. But I'm tired of buggy drivers */ + return ULONG_MAX; +} + +static inline bool file_mmap_ok(struct file *file, struct inode *inode, + unsigned long pgoff, unsigned long len) +{ + u64 maxsize = file_mmap_size_max(file, inode); + + if (maxsize && len > maxsize) + return false; + maxsize -= len; + if (pgoff > maxsize >> PAGE_SHIFT) + return false; + return true; +} + /* * The caller must hold down_write(¤t->mm->mmap_sem). */ @@ -1400,6 +1438,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, struct inode *inode = file_inode(file); unsigned long flags_mask; + if (!file_mmap_ok(file, inode, pgoff, len)) + return -EOVERFLOW; + flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; switch (flags & MAP_TYPE) { @@ -3015,6 +3056,32 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); + if (unlikely(mm_is_oom_victim(mm))) { + /* + * Manually reap the mm to free as much memory as possible. + * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard + * this mm from further consideration. Taking mm->mmap_sem for + * write after setting MMF_OOM_SKIP will guarantee that the oom + * reaper will not run on this mm again after mmap_sem is + * dropped. + * + * Nothing can be holding mm->mmap_sem here and the above call + * to mmu_notifier_release(mm) ensures mmu notifier callbacks in + * __oom_reap_task_mm() will not block. + * + * This needs to be done before calling munlock_vma_pages_all(), + * which clears VM_LOCKED, otherwise the oom reaper cannot + * reliably test it. + */ + mutex_lock(&oom_lock); + __oom_reap_task_mm(mm); + mutex_unlock(&oom_lock); + + set_bit(MMF_OOM_SKIP, &mm->flags); + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } + if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -3036,24 +3103,6 @@ void exit_mmap(struct mm_struct *mm) /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - - if (unlikely(mm_is_oom_victim(mm))) { - /* - * Wait for oom_reap_task() to stop working on this - * mm. Because MMF_OOM_SKIP is already set before - * calling down_read(), oom_reap_task() will not run - * on this "mm" post up_write(). - * - * mm_is_oom_victim() cannot be set from under us - * either because victim->mm is already set to NULL - * under task_lock before calling mmput and oom_mm is - * set not NULL by the OOM killer only if victim->mm - * is found not NULL while holding the task_lock. - */ - set_bit(MMF_OOM_SKIP, &mm->flags); - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); - } free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ff992fa8760a..8ba6cb88cf58 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -469,7 +469,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) return false; } - #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -480,16 +479,54 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) +void __oom_reap_task_mm(struct mm_struct *mm) { - struct mmu_gather tlb; struct vm_area_struct *vma; + + /* + * Tell all users of get_user/copy_from_user etc... that the content + * is no longer stable. No barriers really needed because unmapping + * should imply barriers already and the reader would hit a page fault + * if it stumbled over a reaped memory. + */ + set_bit(MMF_UNSTABLE, &mm->flags); + + for (vma = mm->mmap ; vma; vma = vma->vm_next) { + if (!can_madv_dontneed_vma(vma)) + continue; + + /* + * Only anonymous pages have a good chance to be dropped + * without additional steps which we cannot afford as we + * are OOM already. + * + * We do not even care about fs backed pages because all + * which are reclaimable have already been reclaimed and + * we do not want to block exit_mmap by keeping mm ref + * count elevated without a good reason. + */ + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { + const unsigned long start = vma->vm_start; + const unsigned long end = vma->vm_end; + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end); + unmap_page_range(&tlb, vma, start, end, NULL); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + } + } +} + +static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) +{ bool ret = true; /* * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: - * __oom_reap_task_mm exit_mm + * oom_reap_task_mm exit_mm * mmget_not_zero * mmput * atomic_dec_and_test @@ -534,39 +571,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) trace_start_task_reaping(tsk->pid); - /* - * Tell all users of get_user/copy_from_user etc... that the content - * is no longer stable. No barriers really needed because unmapping - * should imply barriers already and the reader would hit a page fault - * if it stumbled over a reaped memory. - */ - set_bit(MMF_UNSTABLE, &mm->flags); - - for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_dontneed_vma(vma)) - continue; + __oom_reap_task_mm(mm); - /* - * Only anonymous pages have a good chance to be dropped - * without additional steps which we cannot afford as we - * are OOM already. - * - * We do not even care about fs backed pages because all - * which are reclaimable have already been reclaimed and - * we do not want to block exit_mmap by keeping mm ref - * count elevated without a good reason. - */ - if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - const unsigned long start = vma->vm_start; - const unsigned long end = vma->vm_end; - - tlb_gather_mmu(&tlb, mm, start, end); - mmu_notifier_invalidate_range_start(mm, start, end); - unmap_page_range(&tlb, vma, start, end, NULL); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); - } - } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), @@ -587,14 +593,13 @@ static void oom_reap_task(struct task_struct *tsk) struct mm_struct *mm = tsk->signal->oom_mm; /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) + while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || test_bit(MMF_OOM_SKIP, &mm->flags)) goto done; - pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); debug_show_all_locks(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5c1a3279e63f..337c6afb3345 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2502,13 +2502,13 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_node_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); } } EXPORT_SYMBOL(account_page_redirty); @@ -2614,15 +2614,15 @@ void __cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; lock_page_memcg(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, wb); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); unlock_page_memcg(page); } else { ClearPageDirty(page); @@ -2654,7 +2654,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. @@ -2691,14 +2691,14 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) { dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); return ret; } return TestClearPageDirty(page); diff --git a/mm/rmap.c b/mm/rmap.c index f0dd4e4565bc..8d5337fed37b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1374,9 +1374,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!pvmw.pte && (flags & TTU_MIGRATION)) { VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); - if (!PageAnon(page)) - continue; - set_pmd_migration_entry(&pvmw, page); continue; } diff --git a/mm/sparse.c b/mm/sparse.c index 62eef264a7bd..73dc2fcc0eab 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -629,7 +629,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(start_pfn); + unsigned long section_nr = pfn_to_section_nr(pfn); struct mem_section *ms; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 8b920ce3ae02..9b697323a88c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -303,7 +303,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone /* * Add a shrinker callback to be called from the vm. */ -int register_shrinker(struct shrinker *shrinker) +int prealloc_shrinker(struct shrinker *shrinker) { size_t size = sizeof(*shrinker->nr_deferred); @@ -313,10 +313,29 @@ int register_shrinker(struct shrinker *shrinker) shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) return -ENOMEM; + return 0; +} + +void free_prealloced_shrinker(struct shrinker *shrinker) +{ + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; +} +void register_shrinker_prepared(struct shrinker *shrinker) +{ down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); +} + +int register_shrinker(struct shrinker *shrinker) +{ + int err = prealloc_shrinker(shrinker); + + if (err) + return err; + register_shrinker_prepared(shrinker); return 0; } EXPORT_SYMBOL(register_shrinker); diff --git a/mm/vmstat.c b/mm/vmstat.c index 536332e988b8..a2b9518980ce 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1161,7 +1161,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", - "nr_indirectly_reclaimable", + "", /* nr_indirectly_reclaimable */ /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1740,6 +1740,10 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; + /* Skip hidden vmstat items. */ + if (*vmstat_text[off] == '\0') + return 0; + seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); diff --git a/mm/z3fold.c b/mm/z3fold.c index c0bca6153b95..4b366d181f35 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -144,7 +144,8 @@ enum z3fold_page_flags { PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, NEEDS_COMPACTING, - PAGE_STALE + PAGE_STALE, + UNDER_RECLAIM }; /***************** @@ -173,6 +174,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); + clear_bit(UNDER_RECLAIM, &page->private); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -756,6 +758,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) atomic64_dec(&pool->pages_nr); return; } + if (test_bit(UNDER_RECLAIM, &page->private)) { + z3fold_page_unlock(zhdr); + return; + } if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); return; @@ -840,6 +846,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); zhdr->cpu = -1; + set_bit(UNDER_RECLAIM, &page->private); + break; } list_del_init(&page->lru); @@ -887,25 +895,35 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) goto next; } next: - spin_lock(&pool->lock); if (test_bit(PAGE_HEADLESS, &page->private)) { if (ret == 0) { - spin_unlock(&pool->lock); free_z3fold_page(page); return 0; } - } else if (kref_put(&zhdr->refcount, release_z3fold_page)) { - atomic64_dec(&pool->pages_nr); + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); + } else { + z3fold_page_lock(zhdr); + clear_bit(UNDER_RECLAIM, &page->private); + if (kref_put(&zhdr->refcount, + release_z3fold_page_locked)) { + atomic64_dec(&pool->pages_nr); + return 0; + } + /* + * if we are here, the page is still not completely + * free. Take the global pool lock then to be able + * to add it back to the lru list + */ + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); - return 0; + z3fold_page_unlock(zhdr); } - /* - * Add to the beginning of LRU. - * Pool lock has to be kept here to ensure the page has - * not already been released - */ - list_add(&page->lru, &pool->lru); + /* We started off locked to we need to lock the pool back */ + spin_lock(&pool->lock); } spin_unlock(&pool->lock); return -EAGAIN; |