From cea86fe246b694a191804b47378eb9d77aefabec Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:26:39 -0800 Subject: mm/munlock: rmap call mlock_vma_page() munlock_vma_page() Add vma argument to mlock_vma_page() and munlock_vma_page(), make them inline functions which check (vma->vm_flags & VM_LOCKED) before calling mlock_page() and munlock_page() in mm/mlock.c. Add bool compound to mlock_vma_page() and munlock_vma_page(): this is because we have understandable difficulty in accounting pte maps of THPs, and if passed a PageHead page, mlock_page() and munlock_page() cannot tell whether it's a pmd map to be counted or a pte map to be ignored. Add vma arg to page_add_file_rmap() and page_remove_rmap(), like the others, and use that to call mlock_vma_page() at the end of the page adds, and munlock_vma_page() at the end of page_remove_rmap() (end or beginning? unimportant, but end was easier for assertions in testing). No page lock is required (although almost all adds happen to hold it): delete the "Serialize with page migration" BUG_ON(!PageLocked(page))s. Certainly page lock did serialize with page migration, but I'm having difficulty explaining why that was ever important. Mlock accounting on THPs has been hard to define, differed between anon and file, involved PageDoubleMap in some places and not others, required clear_page_mlock() at some points. Keep it simple now: just count the pmds and ignore the ptes, there is no reason for ptes to undo pmd mlocks. page_add_new_anon_rmap() callers unchanged: they have long been calling lru_cache_add_inactive_or_unevictable(), which does its own VM_LOCKED handling (it also checks for not VM_SPECIAL: I think that's overcautious, and inconsistent with other checks, that mmap_region() already prevents VM_LOCKED on VM_SPECIAL; but haven't quite convinced myself to change it). Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/migrate.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index c7da064b4781..7c4223ce2500 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,14 +248,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else - page_add_file_rmap(new, false); + page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) - mlock_vma_page(new); - - if (PageTransHuge(page) && PageMlocked(page)) - clear_page_mlock(page); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); @@ -2331,7 +2326,7 @@ again: * drop page refcount. Page won't be freed, as we took * a reference just above. */ - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); put_page(page); if (pte_present(pte)) -- cgit v1.2.3-70-g09d2 From c3096e6782b733158bf34f6bbb4567808d4e0740 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:33:17 -0800 Subject: mm/migrate: __unmap_and_move() push good newpage to LRU Compaction, NUMA page movement, THP collapse/split, and memory failure do isolate unevictable pages from their "LRU", losing the record of mlock_count in doing so (isolators are likely to use page->lru for their own private lists, so mlock_count has to be presumed lost). That's unfortunate, and we should put in some work to correct that: one can imagine a function to build up the mlock_count again - but it would require i_mmap_rwsem for read, so be careful where it's called. Or page_referenced_one() and try_to_unmap_one() might do that extra work. But one place that can very easily be improved is page migration's __unmap_and_move(): a small adjustment to where the successful new page is put back on LRU, and its mlock_count (if any) is built back up by remove_migration_ptes(). Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/migrate.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 7c4223ce2500..f4bcf1541b62 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1032,6 +1032,21 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (!page_mapped(page)) rc = move_to_new_page(newpage, page, mode); + /* + * When successful, push newpage to LRU immediately: so that if it + * turns out to be an mlocked page, remove_migration_ptes() will + * automatically build up the correct newpage->mlock_count for it. + * + * We would like to do something similar for the old page, when + * unsuccessful, and other cases when a page has been temporarily + * isolated from the unevictable LRU: but this case is the easiest. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + lru_cache_add(newpage); + if (page_was_mapped) + lru_add_drain(); + } + if (page_was_mapped) remove_migration_ptes(page, rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); @@ -1045,20 +1060,12 @@ out_unlock: unlock_page(page); out: /* - * If migration is successful, decrease refcount of the newpage + * If migration is successful, decrease refcount of the newpage, * which will not free the page because new page owner increased - * refcounter. As well, if it is LRU page, add the page to LRU - * list in here. Use the old state of the isolated source page to - * determine if we migrated a LRU page. newpage was already unlocked - * and possibly modified by its owner - don't rely on the page - * state. + * refcounter. */ - if (rc == MIGRATEPAGE_SUCCESS) { - if (unlikely(!is_lru)) - put_page(newpage); - else - putback_lru_page(newpage); - } + if (rc == MIGRATEPAGE_SUCCESS) + put_page(newpage); return rc; } -- cgit v1.2.3-70-g09d2 From b74355078b6554271371532a5daa3b1a3db620f9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:38:47 -0800 Subject: mm/munlock: page migration needs mlock pagevec drained Page migration of a VM_LOCKED page tends to fail, because when the old page is unmapped, it is put on the mlock pagevec with raised refcount, which then fails the freeze. At first I thought this would be fixed by a local mlock_page_drain() at the upper rmap_walk() level - which would have nicely batched all the munlocks of that page; but tests show that the task can too easily move to another cpu, leaving pagevec residue behind which fails the migration. So try_to_migrate_one() drain the local pagevec after page_remove_rmap() from a VM_LOCKED vma; and do the same in try_to_unmap_one(), whose TTU_IGNORE_MLOCK users would want the same treatment; and do the same in remove_migration_pte() - not important when successfully inserting a new page, but necessary when hoping to retry after failure. Any new pagevec runs the risk of adding a new way of stranding, and we might discover other corners where mlock_page_drain() or lru_add_drain() would now help. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/migrate.c | 2 ++ mm/rmap.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index f4bcf1541b62..e7d0b68d5dcb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -251,6 +251,8 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); diff --git a/mm/rmap.c b/mm/rmap.c index 5442a5c97a85..714bfdc72c7b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1656,6 +1656,8 @@ discard: * See Documentation/vm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, PageHuge(page)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); put_page(page); } @@ -1930,6 +1932,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, PageHuge(page)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); put_page(page); } -- cgit v1.2.3-70-g09d2 From 27674ef6c73f0c9096a9827dc5d6ba9fc7808422 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:36 +1100 Subject: mm: remove the extra ZONE_DEVICE struct page refcount ZONE_DEVICE struct pages have an extra reference count that complicates the code for put_page() and several places in the kernel that need to check the reference count to see that a page is not being used (gup, compaction, migration, etc.). Clean up the code so the reference count doesn't need to be treated specially for ZONE_DEVICE pages. Note that this excludes the special idle page wakeup for fsdax pages, which still happens at refcount 1. This is a separate issue and will be sorted out later. Given that only fsdax pages require the notifiacation when the refcount hits 1 now, the PAGEMAP_OPS Kconfig symbol can go away and be replaced with a FS_DAX check for this hook in the put_page fastpath. Based on an earlier patch from Ralph Campbell . Link: https://lkml.kernel.org/r/20220210072828.2930359-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 - drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 1 - drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 - fs/Kconfig | 1 - include/linux/memremap.h | 12 +++---- include/linux/mm.h | 6 ++-- lib/test_hmm.c | 1 - mm/Kconfig | 4 --- mm/internal.h | 2 ++ mm/memcontrol.c | 11 ++---- mm/memremap.c | 57 +++++++++++--------------------- mm/migrate.c | 6 ---- mm/swap.c | 16 +++------ 13 files changed, 36 insertions(+), 83 deletions(-) (limited to 'mm/migrate.c') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 881951604227..8cabdb39cbbc 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -713,7 +713,6 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - get_page(dpage); lock_page(dpage); return dpage; out_clear: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index cb835f95a76e..e27ca3758762 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -225,7 +225,6 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - get_page(page); lock_page(page); } diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index a5cdfbe32b5e..7ba66ad68a8a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -326,7 +326,6 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - get_page(page); lock_page(page); return page; } diff --git a/fs/Kconfig b/fs/Kconfig index 6c7dc1387beb..e9433bbc4801 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -48,7 +48,6 @@ config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) - select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) select FS_IOMAP select DAX help diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 514ab46f597e..d6a114dd5ea8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -68,9 +68,9 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 1. (ZONE_DEVICE pages never - * reach 0 refcount unless there is a refcount bug. This allows the - * device driver to implement its own memory management.) + * Called once the page refcount reaches 0. The reference count will be + * reset to one by the core code after the method is called to prepare + * for handing out the page again. */ void (*page_free)(struct page *page); @@ -133,16 +133,14 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) static inline bool is_device_private_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_pci_p2pdma_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && + return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } diff --git a/include/linux/mm.h b/include/linux/mm.h index cb8bee88e70c..0201d258c646 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1090,7 +1090,7 @@ static inline bool is_zone_movable_page(const struct page *page) return page_zonenum(page) == ZONE_MOVABLE; } -#ifdef CONFIG_DEV_PAGEMAP_OPS +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); bool __put_devmap_managed_page(struct page *page); @@ -1103,12 +1103,12 @@ static inline bool put_devmap_managed_page(struct page *page) return __put_devmap_managed_page(page); } -#else /* CONFIG_DEV_PAGEMAP_OPS */ +#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ static inline bool put_devmap_managed_page(struct page *page) { return false; } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e5fc14ba71f3..cfe632047839 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -566,7 +566,6 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) } dpage->zone_device_data = rpage; - get_page(dpage); lock_page(dpage); return dpage; diff --git a/mm/Kconfig b/mm/Kconfig index 3326ee3903f3..a1901ae6d062 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -776,9 +776,6 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config DEV_PAGEMAP_OPS - bool - # # Helpers to mirror range of the CPU page tables of a process into device page # tables. @@ -790,7 +787,6 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ZONE_DEVICE - select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent unaddressable device diff --git a/mm/internal.h b/mm/internal.h index 450a2c8a43f3..3756dd5d2c92 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -735,4 +735,6 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); +void free_zone_device_page(struct page *page); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2c5032294c9f..8fef072dc1ce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5503,17 +5503,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to - * a device and because they are not accessible by CPU they are store - * as special swap entry in the CPU page table. + * Handle device private pages that are not accessible by the CPU, but + * stored as special swap entries in the page table. */ if (is_device_private_entry(ent)) { page = pfn_swap_entry_to_page(ent); - /* - * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have - * a refcount of 1 when free (unlike normal page) - */ - if (!page_ref_add_unless(page, 1, 1)) + if (!get_page_unless_zero(page)) return NULL; return page; } diff --git a/mm/memremap.c b/mm/memremap.c index a0ece2344c2c..fef5734d5e49 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -12,6 +12,7 @@ #include #include #include +#include "internal.h" static DEFINE_XARRAY(pgmap_array); @@ -37,21 +38,19 @@ unsigned long memremap_compat_align(void) EXPORT_SYMBOL_GPL(memremap_compat_align); #endif -#ifdef CONFIG_DEV_PAGEMAP_OPS +#ifdef CONFIG_FS_DAX DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_dec(&devmap_managed_key); } static void devmap_managed_enable_get(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_inc(&devmap_managed_key); } #else @@ -61,7 +60,7 @@ static void devmap_managed_enable_get(struct dev_pagemap *pgmap) static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ static void pgmap_array_delete(struct range *range) { @@ -102,23 +101,12 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) return (range->start + range_len(range)) >> PAGE_SHIFT; } -static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) -{ - if (pfn % (1024 << pgmap->vmemmap_shift)) - cond_resched(); - return pfn + pgmap_vmemmap_nr(pgmap); -} - static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) { return (pfn_end(pgmap, range_id) - pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } -#define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ - pfn = pfn_next(map, pfn)) - static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; @@ -147,13 +135,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) void memunmap_pages(struct dev_pagemap *pgmap) { - unsigned long pfn; int i; percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) - for_each_device_pfn(pfn, pgmap, i) - put_page(pfn_to_page(pfn)); + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); wait_for_completion(&pgmap->done); percpu_ref_exit(&pgmap->ref); @@ -464,14 +450,10 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -#ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page) +void free_zone_device_page(struct page *page) { - /* notify page idle for dax */ - if (!is_device_private_page(page)) { - wake_up_var(&page->_refcount); + if (WARN_ON_ONCE(!is_device_private_page(page))) return; - } __ClearPageWaiters(page); @@ -500,28 +482,27 @@ void free_devmap_managed_page(struct page *page) */ page->mapping = NULL; page->pgmap->ops->page_free(page); + + /* + * Reset the page count to 1 to prepare for handing out the page again. + */ + set_page_count(page, 1); } +#ifdef CONFIG_FS_DAX bool __put_devmap_managed_page(struct page *page) { - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_FS_DAX) + if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) return false; /* - * devmap page refcounts are 1-based, rather than 0-based: if + * fsdax page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ - switch (page_ref_dec_return(page)) { - case 1: - free_devmap_managed_page(page); - break; - case 0: - __put_page(page); - break; - } + if (page_ref_dec_return(page) == 1) + wake_up_var(&page->_refcount); return true; } EXPORT_SYMBOL(__put_devmap_managed_page); -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ diff --git a/mm/migrate.c b/mm/migrate.c index e7d0b68d5dcb..af0534de618a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -338,14 +338,8 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) { int expected_count = 1; - /* - * Device private pages have an extra refcount as they are - * ZONE_DEVICE pages. - */ - expected_count += is_device_private_page(page); if (mapping) expected_count += compound_nr(page) + page_has_private(page); - return expected_count; } diff --git a/mm/swap.c b/mm/swap.c index db8d0eea13d7..fc3b7989f5b2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -122,17 +122,9 @@ static void __put_compound_page(struct page *page) void __put_page(struct page *page) { - if (is_zone_device_page(page)) { - put_dev_pagemap(page->pgmap); - - /* - * The page belongs to the device that created pgmap. Do - * not return it to page allocator. - */ - return; - } - - if (unlikely(PageCompound(page))) + if (unlikely(is_zone_device_page(page))) + free_zone_device_page(page); + else if (unlikely(PageCompound(page))) __put_compound_page(page); else __put_single_page(page); @@ -933,7 +925,7 @@ void release_pages(struct page **pages, int nr) if (put_devmap_managed_page(page)) continue; if (put_page_testzero(page)) - put_dev_pagemap(page->pgmap); + free_zone_device_page(page); continue; } -- cgit v1.2.3-70-g09d2 From 1776c0d102482d4aeccd56e404285bc47a481be8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:37 +1100 Subject: mm: refactor the ZONE_DEVICE handling in migrate_vma_insert_page Make the flow a little more clear and prepare for adding a new ZONE_DEVICE memory type. Link: https://lkml.kernel.org/r/20220210072828.2930359-12-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Alistair Popple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- mm/migrate.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index af0534de618a..4246dca6c469 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2674,26 +2674,25 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, */ __SetPageUptodate(page); - if (is_zone_device_page(page)) { - if (is_device_private_page(page)) { - swp_entry_t swp_entry; + if (is_device_private_page(page)) { + swp_entry_t swp_entry; - if (vma->vm_flags & VM_WRITE) - swp_entry = make_writable_device_private_entry( - page_to_pfn(page)); - else - swp_entry = make_readable_device_private_entry( - page_to_pfn(page)); - entry = swp_entry_to_pte(swp_entry); - } else { - /* - * For now we only support migrating to un-addressable - * device memory. - */ + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable device + * memory. + */ + if (is_zone_device_page(page)) { pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); goto abort; } - } else { entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); -- cgit v1.2.3-70-g09d2 From aaf7d70cc595c78d27e915451e93a4459cfc36f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:37 +1100 Subject: mm: refactor the ZONE_DEVICE handling in migrate_vma_pages Make the flow a little more clear and prepare for adding a new ZONE_DEVICE memory type. Link: https://lkml.kernel.org/r/20220210072828.2930359-13-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Alistair Popple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- mm/migrate.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 4246dca6c469..7dffd5f82a60 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2792,24 +2792,21 @@ void migrate_vma_pages(struct migrate_vma *migrate) mapping = page_mapping(page); - if (is_zone_device_page(newpage)) { - if (is_device_private_page(newpage)) { - /* - * For now only support private anonymous when - * migrating to un-addressable device memory. - */ - if (mapping) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - } else { - /* - * Other types of ZONE_DEVICE page are not - * supported. - */ + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when migrating + * to un-addressable device memory. + */ + if (mapping) { migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; } + } else if (is_zone_device_page(newpage)) { + /* + * Other types of ZONE_DEVICE page are not supported. + */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; } r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); -- cgit v1.2.3-70-g09d2 From 76cbbead253ddcae9878be0d702208bb1e4fac6f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:38 +1100 Subject: mm: move the migrate_vma_* device migration code into its own file Split the code used to migrate to and from ZONE_DEVICE memory from migrate.c into a new file. Link: https://lkml.kernel.org/r/20220210072828.2930359-14-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- mm/Kconfig | 3 + mm/Makefile | 1 + mm/migrate.c | 753 --------------------------------------------------- mm/migrate_device.c | 766 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 770 insertions(+), 753 deletions(-) create mode 100644 mm/migrate_device.c (limited to 'mm/migrate.c') diff --git a/mm/Kconfig b/mm/Kconfig index a1901ae6d062..6391d8d3a616 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -249,6 +249,9 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. +config DEVICE_MIGRATION + def_bool MIGRATION && DEVICE_PRIVATE + config ARCH_ENABLE_HUGEPAGE_MIGRATION bool diff --git a/mm/Makefile b/mm/Makefile index 70d4309c9ce3..4cc13f3179a5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o +obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o diff --git a/mm/migrate.c b/mm/migrate.c index 7dffd5f82a60..f4076093c855 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,12 +38,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -2129,757 +2127,6 @@ out: #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ -#ifdef CONFIG_DEVICE_PRIVATE -static int migrate_vma_collect_skip(unsigned long start, - unsigned long end, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - unsigned long addr; - - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->dst[migrate->npages] = 0; - migrate->src[migrate->npages++] = 0; - } - - return 0; -} - -static int migrate_vma_collect_hole(unsigned long start, - unsigned long end, - __always_unused int depth, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - unsigned long addr; - - /* Only allow populating anonymous memory. */ - if (!vma_is_anonymous(walk->vma)) - return migrate_vma_collect_skip(start, end, walk); - - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; - migrate->dst[migrate->npages] = 0; - migrate->npages++; - migrate->cpages++; - } - - return 0; -} - -static int migrate_vma_collect_pmd(pmd_t *pmdp, - unsigned long start, - unsigned long end, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - struct vm_area_struct *vma = walk->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr = start, unmapped = 0; - spinlock_t *ptl; - pte_t *ptep; - -again: - if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, -1, walk); - - if (pmd_trans_huge(*pmdp)) { - struct page *page; - - ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_trans_huge(*pmdp))) { - spin_unlock(ptl); - goto again; - } - - page = pmd_page(*pmdp); - if (is_huge_zero_page(page)) { - spin_unlock(ptl); - split_huge_pmd(vma, pmdp, addr); - if (pmd_trans_unstable(pmdp)) - return migrate_vma_collect_skip(start, end, - walk); - } else { - int ret; - - get_page(page); - spin_unlock(ptl); - if (unlikely(!trylock_page(page))) - return migrate_vma_collect_skip(start, end, - walk); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (ret) - return migrate_vma_collect_skip(start, end, - walk); - if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, -1, - walk); - } - } - - if (unlikely(pmd_bad(*pmdp))) - return migrate_vma_collect_skip(start, end, walk); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - arch_enter_lazy_mmu_mode(); - - for (; addr < end; addr += PAGE_SIZE, ptep++) { - unsigned long mpfn = 0, pfn; - struct page *page; - swp_entry_t entry; - pte_t pte; - - pte = *ptep; - - if (pte_none(pte)) { - if (vma_is_anonymous(vma)) { - mpfn = MIGRATE_PFN_MIGRATE; - migrate->cpages++; - } - goto next; - } - - if (!pte_present(pte)) { - /* - * Only care about unaddressable device page special - * page table entry. Other special swap entries are not - * migratable, and we ignore regular swapped page. - */ - entry = pte_to_swp_entry(pte); - if (!is_device_private_entry(entry)) - goto next; - - page = pfn_swap_entry_to_page(entry); - if (!(migrate->flags & - MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || - page->pgmap->owner != migrate->pgmap_owner) - goto next; - - mpfn = migrate_pfn(page_to_pfn(page)) | - MIGRATE_PFN_MIGRATE; - if (is_writable_device_private_entry(entry)) - mpfn |= MIGRATE_PFN_WRITE; - } else { - if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) - goto next; - pfn = pte_pfn(pte); - if (is_zero_pfn(pfn)) { - mpfn = MIGRATE_PFN_MIGRATE; - migrate->cpages++; - goto next; - } - page = vm_normal_page(migrate->vma, addr, pte); - mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; - mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; - } - - /* FIXME support THP */ - if (!page || !page->mapping || PageTransCompound(page)) { - mpfn = 0; - goto next; - } - - /* - * By getting a reference on the page we pin it and that blocks - * any kind of migration. Side effect is that it "freezes" the - * pte. - * - * We drop this reference after isolating the page from the lru - * for non device page (device page are not on the lru and thus - * can't be dropped from it). - */ - get_page(page); - - /* - * Optimize for the common case where page is only mapped once - * in one process. If we can lock the page, then we can safely - * set up a special migration page table entry now. - */ - if (trylock_page(page)) { - pte_t swp_pte; - - migrate->cpages++; - ptep_get_and_clear(mm, addr, ptep); - - /* Setup special migration page table entry */ - if (mpfn & MIGRATE_PFN_WRITE) - entry = make_writable_migration_entry( - page_to_pfn(page)); - else - entry = make_readable_migration_entry( - page_to_pfn(page)); - swp_pte = swp_entry_to_pte(entry); - if (pte_present(pte)) { - if (pte_soft_dirty(pte)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pte)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - } else { - if (pte_swp_soft_dirty(pte)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_swp_uffd_wp(pte)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - } - set_pte_at(mm, addr, ptep, swp_pte); - - /* - * This is like regular unmap: we remove the rmap and - * drop page refcount. Page won't be freed, as we took - * a reference just above. - */ - page_remove_rmap(page, vma, false); - put_page(page); - - if (pte_present(pte)) - unmapped++; - } else { - put_page(page); - mpfn = 0; - } - -next: - migrate->dst[migrate->npages] = 0; - migrate->src[migrate->npages++] = mpfn; - } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(ptep - 1, ptl); - - /* Only flush the TLB if we actually modified any entries */ - if (unmapped) - flush_tlb_range(walk->vma, start, end); - - return 0; -} - -static const struct mm_walk_ops migrate_vma_walk_ops = { - .pmd_entry = migrate_vma_collect_pmd, - .pte_hole = migrate_vma_collect_hole, -}; - -/* - * migrate_vma_collect() - collect pages over a range of virtual addresses - * @migrate: migrate struct containing all migration information - * - * This will walk the CPU page table. For each virtual address backed by a - * valid page, it updates the src array and takes a reference on the page, in - * order to pin the page until we lock it and unmap it. - */ -static void migrate_vma_collect(struct migrate_vma *migrate) -{ - struct mmu_notifier_range range; - - /* - * Note that the pgmap_owner is passed to the mmu notifier callback so - * that the registered device driver can skip invalidating device - * private page mappings that won't be migrated. - */ - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, - migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, - migrate->pgmap_owner); - mmu_notifier_invalidate_range_start(&range); - - walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, - &migrate_vma_walk_ops, migrate); - - mmu_notifier_invalidate_range_end(&range); - migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); -} - -/* - * migrate_vma_check_page() - check if page is pinned or not - * @page: struct page to check - * - * Pinned pages cannot be migrated. This is the same test as in - * folio_migrate_mapping(), except that here we allow migration of a - * ZONE_DEVICE page. - */ -static bool migrate_vma_check_page(struct page *page) -{ - /* - * One extra ref because caller holds an extra reference, either from - * isolate_lru_page() for a regular page, or migrate_vma_collect() for - * a device page. - */ - int extra = 1; - - /* - * FIXME support THP (transparent huge page), it is bit more complex to - * check them than regular pages, because they can be mapped with a pmd - * or with a pte (split pte mapping). - */ - if (PageCompound(page)) - return false; - - /* Page from ZONE_DEVICE have one extra reference */ - if (is_zone_device_page(page)) - extra++; - - /* For file back page */ - if (page_mapping(page)) - extra += 1 + page_has_private(page); - - if ((page_count(page) - extra) > page_mapcount(page)) - return false; - - return true; -} - -/* - * migrate_vma_unmap() - replace page mapping with special migration pte entry - * @migrate: migrate struct containing all migration information - * - * Isolate pages from the LRU and replace mappings (CPU page table pte) with a - * special migration pte entry and check if it has been pinned. Pinned pages are - * restored because we cannot migrate them. - * - * This is the last step before we call the device driver callback to allocate - * destination memory and copy contents of original page over to new page. - */ -static void migrate_vma_unmap(struct migrate_vma *migrate) -{ - const unsigned long npages = migrate->npages; - unsigned long i, restore = 0; - bool allow_drain = true; - - lru_add_drain(); - - for (i = 0; i < npages; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); - - if (!page) - continue; - - /* ZONE_DEVICE pages are not on LRU */ - if (!is_zone_device_page(page)) { - if (!PageLRU(page) && allow_drain) { - /* Drain CPU's pagevec */ - lru_add_drain_all(); - allow_drain = false; - } - - if (isolate_lru_page(page)) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; - restore++; - continue; - } - - /* Drop the reference we took in collect */ - put_page(page); - } - - if (page_mapped(page)) - try_to_migrate(page, 0); - - if (page_mapped(page) || !migrate_vma_check_page(page)) { - if (!is_zone_device_page(page)) { - get_page(page); - putback_lru_page(page); - } - - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; - restore++; - continue; - } - } - - for (i = 0; i < npages && restore; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); - - if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) - continue; - - remove_migration_ptes(page, page, false); - - migrate->src[i] = 0; - unlock_page(page); - put_page(page); - restore--; - } -} - -/** - * migrate_vma_setup() - prepare to migrate a range of memory - * @args: contains the vma, start, and pfns arrays for the migration - * - * Returns: negative errno on failures, 0 when 0 or more pages were migrated - * without an error. - * - * Prepare to migrate a range of memory virtual address range by collecting all - * the pages backing each virtual address in the range, saving them inside the - * src array. Then lock those pages and unmap them. Once the pages are locked - * and unmapped, check whether each page is pinned or not. Pages that aren't - * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the - * corresponding src array entry. Then restores any pages that are pinned, by - * remapping and unlocking those pages. - * - * The caller should then allocate destination memory and copy source memory to - * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE - * flag set). Once these are allocated and copied, the caller must update each - * corresponding entry in the dst array with the pfn value of the destination - * page and with MIGRATE_PFN_VALID. Destination pages must be locked via - * lock_page(). - * - * Note that the caller does not have to migrate all the pages that are marked - * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from - * device memory to system memory. If the caller cannot migrate a device page - * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe - * consequences for the userspace process, so it must be avoided if at all - * possible. - * - * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we - * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus - * allowing the caller to allocate device memory for those unbacked virtual - * addresses. For this the caller simply has to allocate device memory and - * properly set the destination entry like for regular migration. Note that - * this can still fail, and thus inside the device driver you must check if the - * migration was successful for those entries after calling migrate_vma_pages(), - * just like for regular migration. - * - * After that, the callers must call migrate_vma_pages() to go over each entry - * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag - * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, - * then migrate_vma_pages() to migrate struct page information from the source - * struct page to the destination struct page. If it fails to migrate the - * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the - * src array. - * - * At this point all successfully migrated pages have an entry in the src - * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst - * array entry with MIGRATE_PFN_VALID flag set. - * - * Once migrate_vma_pages() returns the caller may inspect which pages were - * successfully migrated, and which were not. Successfully migrated pages will - * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. - * - * It is safe to update device page table after migrate_vma_pages() because - * both destination and source page are still locked, and the mmap_lock is held - * in read mode (hence no one can unmap the range being migrated). - * - * Once the caller is done cleaning up things and updating its page table (if it - * chose to do so, this is not an obligation) it finally calls - * migrate_vma_finalize() to update the CPU page table to point to new pages - * for successfully migrated pages or otherwise restore the CPU page table to - * point to the original source pages. - */ -int migrate_vma_setup(struct migrate_vma *args) -{ - long nr_pages = (args->end - args->start) >> PAGE_SHIFT; - - args->start &= PAGE_MASK; - args->end &= PAGE_MASK; - if (!args->vma || is_vm_hugetlb_page(args->vma) || - (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) - return -EINVAL; - if (nr_pages <= 0) - return -EINVAL; - if (args->start < args->vma->vm_start || - args->start >= args->vma->vm_end) - return -EINVAL; - if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) - return -EINVAL; - if (!args->src || !args->dst) - return -EINVAL; - - memset(args->src, 0, sizeof(*args->src) * nr_pages); - args->cpages = 0; - args->npages = 0; - - migrate_vma_collect(args); - - if (args->cpages) - migrate_vma_unmap(args); - - /* - * At this point pages are locked and unmapped, and thus they have - * stable content and can safely be copied to destination memory that - * is allocated by the drivers. - */ - return 0; - -} -EXPORT_SYMBOL(migrate_vma_setup); - -/* - * This code closely matches the code in: - * __handle_mm_fault() - * handle_pte_fault() - * do_anonymous_page() - * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE - * private page. - */ -static void migrate_vma_insert_page(struct migrate_vma *migrate, - unsigned long addr, - struct page *page, - unsigned long *src) -{ - struct vm_area_struct *vma = migrate->vma; - struct mm_struct *mm = vma->vm_mm; - bool flush = false; - spinlock_t *ptl; - pte_t entry; - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - /* Only allow populating anonymous memory */ - if (!vma_is_anonymous(vma)) - goto abort; - - pgdp = pgd_offset(mm, addr); - p4dp = p4d_alloc(mm, pgdp, addr); - if (!p4dp) - goto abort; - pudp = pud_alloc(mm, p4dp, addr); - if (!pudp) - goto abort; - pmdp = pmd_alloc(mm, pudp, addr); - if (!pmdp) - goto abort; - - if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) - goto abort; - - /* - * Use pte_alloc() instead of pte_alloc_map(). We can't run - * pte_offset_map() on pmds where a huge pmd might be created - * from a different thread. - * - * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when - * parallel threads are excluded by other means. - * - * Here we only have mmap_read_lock(mm). - */ - if (pte_alloc(mm, pmdp)) - goto abort; - - /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(pmdp))) - goto abort; - - if (unlikely(anon_vma_prepare(vma))) - goto abort; - if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) - goto abort; - - /* - * The memory barrier inside __SetPageUptodate makes sure that - * preceding stores to the page contents become visible before - * the set_pte_at() write. - */ - __SetPageUptodate(page); - - if (is_device_private_page(page)) { - swp_entry_t swp_entry; - - if (vma->vm_flags & VM_WRITE) - swp_entry = make_writable_device_private_entry( - page_to_pfn(page)); - else - swp_entry = make_readable_device_private_entry( - page_to_pfn(page)); - entry = swp_entry_to_pte(swp_entry); - } else { - /* - * For now we only support migrating to un-addressable device - * memory. - */ - if (is_zone_device_page(page)) { - pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); - goto abort; - } - entry = mk_pte(page, vma->vm_page_prot); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry)); - } - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - - if (check_stable_address_space(mm)) - goto unlock_abort; - - if (pte_present(*ptep)) { - unsigned long pfn = pte_pfn(*ptep); - - if (!is_zero_pfn(pfn)) - goto unlock_abort; - flush = true; - } else if (!pte_none(*ptep)) - goto unlock_abort; - - /* - * Check for userfaultfd but do not deliver the fault. Instead, - * just back off. - */ - if (userfaultfd_missing(vma)) - goto unlock_abort; - - inc_mm_counter(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr, false); - if (!is_zone_device_page(page)) - lru_cache_add_inactive_or_unevictable(page, vma); - get_page(page); - - if (flush) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } else { - /* No need to invalidate - it was non-present before */ - set_pte_at(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } - - pte_unmap_unlock(ptep, ptl); - *src = MIGRATE_PFN_MIGRATE; - return; - -unlock_abort: - pte_unmap_unlock(ptep, ptl); -abort: - *src &= ~MIGRATE_PFN_MIGRATE; -} - -/** - * migrate_vma_pages() - migrate meta-data from src page to dst page - * @migrate: migrate struct containing all migration information - * - * This migrates struct page meta-data from source struct page to destination - * struct page. This effectively finishes the migration from source page to the - * destination page. - */ -void migrate_vma_pages(struct migrate_vma *migrate) -{ - const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - struct mmu_notifier_range range; - unsigned long addr, i; - bool notified = false; - - for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); - struct address_space *mapping; - int r; - - if (!newpage) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - - if (!page) { - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) - continue; - if (!notified) { - notified = true; - - mmu_notifier_range_init_owner(&range, - MMU_NOTIFY_MIGRATE, 0, migrate->vma, - migrate->vma->vm_mm, addr, migrate->end, - migrate->pgmap_owner); - mmu_notifier_invalidate_range_start(&range); - } - migrate_vma_insert_page(migrate, addr, newpage, - &migrate->src[i]); - continue; - } - - mapping = page_mapping(page); - - if (is_device_private_page(newpage)) { - /* - * For now only support private anonymous when migrating - * to un-addressable device memory. - */ - if (mapping) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - } else if (is_zone_device_page(newpage)) { - /* - * Other types of ZONE_DEVICE page are not supported. - */ - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - - r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); - if (r != MIGRATEPAGE_SUCCESS) - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - } - - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() - * did already call it. - */ - if (notified) - mmu_notifier_invalidate_range_only_end(&range); -} -EXPORT_SYMBOL(migrate_vma_pages); - -/** - * migrate_vma_finalize() - restore CPU page table entry - * @migrate: migrate struct containing all migration information - * - * This replaces the special migration pte entry with either a mapping to the - * new page if migration was successful for that page, or to the original page - * otherwise. - * - * This also unlocks the pages and puts them back on the lru, or drops the extra - * refcount, for device pages. - */ -void migrate_vma_finalize(struct migrate_vma *migrate) -{ - const unsigned long npages = migrate->npages; - unsigned long i; - - for (i = 0; i < npages; i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); - - if (!page) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); - } - continue; - } - - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); - } - newpage = page; - } - - remove_migration_ptes(page, newpage, false); - unlock_page(page); - - if (is_zone_device_page(page)) - put_page(page); - else - putback_lru_page(page); - - if (newpage != page) { - unlock_page(newpage); - if (is_zone_device_page(newpage)) - put_page(newpage); - else - putback_lru_page(newpage); - } - } -} -EXPORT_SYMBOL(migrate_vma_finalize); -#endif /* CONFIG_DEVICE_PRIVATE */ - /* * node_demotion[] example: * diff --git a/mm/migrate_device.c b/mm/migrate_device.c new file mode 100644 index 000000000000..0326b901d2fd --- /dev/null +++ b/mm/migrate_device.c @@ -0,0 +1,766 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device Memory Migration functionality. + * + * Originally written by Jérôme Glisse. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; +} + +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + __always_unused int depth, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + /* Only allow populating anonymous memory. */ + if (!vma_is_anonymous(walk->vma)) + return migrate_vma_collect_skip(start, end, walk); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + } + + return 0; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + pte_t *ptep; + +again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_skip(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_skip(start, end, + walk); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_skip(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + arch_enter_lazy_mmu_mode(); + + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn = 0, pfn; + struct page *page; + swp_entry_t entry; + pte_t pte; + + pte = *ptep; + + if (pte_none(pte)) { + if (vma_is_anonymous(vma)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + } + goto next; + } + + if (!pte_present(pte)) { + /* + * Only care about unaddressable device page special + * page table entry. Other special swap entries are not + * migratable, and we ignore regular swapped page. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = pfn_swap_entry_to_page(entry); + if (!(migrate->flags & + MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || + page->pgmap->owner != migrate->pgmap_owner) + goto next; + + mpfn = migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE; + if (is_writable_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + pfn = pte_pfn(pte); + if (is_zero_pfn(pfn)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + goto next; + } + page = vm_normal_page(migrate->vma, addr, pte); + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + } + + /* FIXME support THP */ + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = 0; + goto next; + } + + /* + * By getting a reference on the page we pin it and that blocks + * any kind of migration. Side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru + * for non device page (device page are not on the lru and thus + * can't be dropped from it). + */ + get_page(page); + + /* + * Optimize for the common case where page is only mapped once + * in one process. If we can lock the page, then we can safely + * set up a special migration page table entry now. + */ + if (trylock_page(page)) { + pte_t swp_pte; + + migrate->cpages++; + ptep_get_and_clear(mm, addr, ptep); + + /* Setup special migration page table entry */ + if (mpfn & MIGRATE_PFN_WRITE) + entry = make_writable_migration_entry( + page_to_pfn(page)); + else + entry = make_readable_migration_entry( + page_to_pfn(page)); + swp_pte = swp_entry_to_pte(entry); + if (pte_present(pte)) { + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } + set_pte_at(mm, addr, ptep, swp_pte); + + /* + * This is like regular unmap: we remove the rmap and + * drop page refcount. Page won't be freed, as we took + * a reference just above. + */ + page_remove_rmap(page, vma, false); + put_page(page); + + if (pte_present(pte)) + unmapped++; + } else { + put_page(page); + mpfn = 0; + } + +next: + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = mpfn; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + + /* Only flush the TLB if we actually modified any entries */ + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return 0; +} + +static const struct mm_walk_ops migrate_vma_walk_ops = { + .pmd_entry = migrate_vma_collect_pmd, + .pte_hole = migrate_vma_collect_hole, +}; + +/* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ +static void migrate_vma_collect(struct migrate_vma *migrate) +{ + struct mmu_notifier_range range; + + /* + * Note that the pgmap_owner is passed to the mmu notifier callback so + * that the registered device driver can skip invalidating device + * private page mappings that won't be migrated. + */ + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, + migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, + &migrate_vma_walk_ops, migrate); + + mmu_notifier_invalidate_range_end(&range); + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); +} + +/* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * folio_migrate_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ +static bool migrate_vma_check_page(struct page *page) +{ + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1; + + /* + * FIXME support THP (transparent huge page), it is bit more complex to + * check them than regular pages, because they can be mapped with a pmd + * or with a pte (split pte mapping). + */ + if (PageCompound(page)) + return false; + + /* Page from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) + extra++; + + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a + * special migration pte entry and check if it has been pinned. Pinned pages are + * restored because we cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i, restore = 0; + bool allow_drain = true; + + lru_add_drain(); + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) + continue; + + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + continue; + } + + /* Drop the reference we took in collect */ + put_page(page); + } + + if (page_mapped(page)) + try_to_migrate(page, 0); + + if (page_mapped(page) || !migrate_vma_check_page(page)) { + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } + + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + continue; + } + } + + for (i = 0; i < npages && restore; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_ptes(page, page, false); + + migrate->src[i] = 0; + unlock_page(page); + put_page(page); + restore--; + } +} + +/** + * migrate_vma_setup() - prepare to migrate a range of memory + * @args: contains the vma, start, and pfns arrays for the migration + * + * Returns: negative errno on failures, 0 when 0 or more pages were migrated + * without an error. + * + * Prepare to migrate a range of memory virtual address range by collecting all + * the pages backing each virtual address in the range, saving them inside the + * src array. Then lock those pages and unmap them. Once the pages are locked + * and unmapped, check whether each page is pinned or not. Pages that aren't + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the + * corresponding src array entry. Then restores any pages that are pinned, by + * remapping and unlocking those pages. + * + * The caller should then allocate destination memory and copy source memory to + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE + * flag set). Once these are allocated and copied, the caller must update each + * corresponding entry in the dst array with the pfn value of the destination + * page and with MIGRATE_PFN_VALID. Destination pages must be locked via + * lock_page(). + * + * Note that the caller does not have to migrate all the pages that are marked + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from + * device memory to system memory. If the caller cannot migrate a device page + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe + * consequences for the userspace process, so it must be avoided if at all + * possible. + * + * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing the caller to allocate device memory for those unbacked virtual + * addresses. For this the caller simply has to allocate device memory and + * properly set the destination entry like for regular migration. Note that + * this can still fail, and thus inside the device driver you must check if the + * migration was successful for those entries after calling migrate_vma_pages(), + * just like for regular migration. + * + * After that, the callers must call migrate_vma_pages() to go over each entry + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then migrate_vma_pages() to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the + * src array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * Once migrate_vma_pages() returns the caller may inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * + * It is safe to update device page table after migrate_vma_pages() because + * both destination and source page are still locked, and the mmap_lock is held + * in read mode (hence no one can unmap the range being migrated). + * + * Once the caller is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) it finally calls + * migrate_vma_finalize() to update the CPU page table to point to new pages + * for successfully migrated pages or otherwise restore the CPU page table to + * point to the original source pages. + */ +int migrate_vma_setup(struct migrate_vma *args) +{ + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; + + args->start &= PAGE_MASK; + args->end &= PAGE_MASK; + if (!args->vma || is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (nr_pages <= 0) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) + return -EINVAL; + if (!args->src || !args->dst) + return -EINVAL; + + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + + if (args->cpages) + migrate_vma_unmap(args); + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the drivers. + */ + return 0; + +} +EXPORT_SYMBOL(migrate_vma_setup); + +/* + * This code closely matches the code in: + * __handle_mm_fault() + * handle_pte_fault() + * do_anonymous_page() + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE + * private page. + */ +static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src) +{ + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when + * parallel threads are excluded by other means. + * + * Here we only have mmap_read_lock(mm). + */ + if (pte_alloc(mm, pmdp)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable device + * memory. + */ + if (is_zone_device_page(page)) { + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; + } + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (check_stable_address_space(mm)) + goto unlock_abort; + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) + goto unlock_abort; + flush = true; + } else if (!pte_none(*ptep)) + goto unlock_abort; + + /* + * Check for userfaultfd but do not deliver the fault. Instead, + * just back off. + */ + if (userfaultfd_missing(vma)) + goto unlock_abort; + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr, false); + if (!is_zone_device_page(page)) + lru_cache_add_inactive_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + +unlock_abort: + pte_unmap_unlock(ptep, ptl); +abort: + *src &= ~MIGRATE_PFN_MIGRATE; +} + +/** + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ +void migrate_vma_pages(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + struct mmu_notifier_range range; + unsigned long addr, i; + bool notified = false; + + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct address_space *mapping; + int r; + + if (!newpage) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + if (!page) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + if (!notified) { + notified = true; + + mmu_notifier_range_init_owner(&range, + MMU_NOTIFY_MIGRATE, 0, migrate->vma, + migrate->vma->vm_mm, addr, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + } + migrate_vma_insert_page(migrate, addr, newpage, + &migrate->src[i]); + continue; + } + + mapping = page_mapping(page); + + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when migrating + * to un-addressable device memory. + */ + if (mapping) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else if (is_zone_device_page(newpage)) { + /* + * Other types of ZONE_DEVICE page are not supported. + */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + } + + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() + * did already call it. + */ + if (notified) + mmu_notifier_invalidate_range_only_end(&range); +} +EXPORT_SYMBOL(migrate_vma_pages); + +/** + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. + */ +void migrate_vma_finalize(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i; + + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + continue; + } + + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + remove_migration_ptes(page, newpage, false); + unlock_page(page); + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); + } + } +} +EXPORT_SYMBOL(migrate_vma_finalize); -- cgit v1.2.3-70-g09d2 From eed05e54d275b3cfc5d8c79843c5276a5878e94a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 3 Feb 2022 09:06:08 -0500 Subject: mm: Add DEFINE_PAGE_VMA_WALK and DEFINE_FOLIO_VMA_WALK Instead of declaring a struct page_vma_mapped_walk directly, use these helpers to allow us to transition to a PFN approach in the following patches. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/rmap.h | 16 ++++++++++++++++ kernel/events/uprobes.c | 6 +----- mm/damon/paddr.c | 12 ++---------- mm/ksm.c | 5 +---- mm/migrate.c | 7 +------ mm/page_idle.c | 6 +----- mm/rmap.c | 31 +++++-------------------------- 7 files changed, 27 insertions(+), 56 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index ac29b076082b..0d894a2bfaa1 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -214,6 +214,22 @@ struct page_vma_mapped_walk { unsigned int flags; }; +#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ + struct page_vma_mapped_walk name = { \ + .page = _page, \ + .vma = _vma, \ + .address = _address, \ + .flags = _flags, \ + } + +#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ + struct page_vma_mapped_walk name = { \ + .page = &_folio->page, \ + .vma = _vma, \ + .address = _address, \ + .flags = _flags, \ + } + static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) { /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eed2f7437d96..6418083901d4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,11 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = compound_head(old_page), - .vma = vma, - .address = addr, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); int err; struct mmu_notifier_range range; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5e8244f65a1a..cb45d49c731d 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -19,11 +19,7 @@ static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = addr, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, addr, 0); while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; @@ -93,11 +89,7 @@ static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { struct damon_pa_access_chk_result *result = arg; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = addr, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, addr, 0); result->accessed = false; result->page_sz = PAGE_SIZE; diff --git a/mm/ksm.c b/mm/ksm.c index c5a4403b5dc9..ea82fef93a31 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1034,10 +1034,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; diff --git a/mm/migrate.c b/mm/migrate.c index f4076093c855..71f92e8ed934 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -174,12 +174,7 @@ void putback_movable_pages(struct list_head *l) static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { - struct page_vma_mapped_walk pvmw = { - .page = old, - .vma = vma, - .address = addr, - .flags = PVMW_SYNC | PVMW_MIGRATION, - }; + DEFINE_PAGE_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); struct page *new; pte_t pte; swp_entry_t entry; diff --git a/mm/page_idle.c b/mm/page_idle.c index edead6a8a5f9..3e05bf1ce825 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -48,11 +48,7 @@ static bool page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = addr, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, addr, 0); bool referenced = false; while (page_vma_mapped_walk(&pvmw)) { diff --git a/mm/rmap.c b/mm/rmap.c index 1a13d5d6cfc7..a7f06b76b503 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -802,11 +802,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_referenced_arg *pra = arg; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, address, 0); int referenced = 0; while (page_vma_mapped_walk(&pvmw)) { @@ -934,12 +930,7 @@ int page_referenced(struct page *page, static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - .flags = PVMW_SYNC, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, address, PVMW_SYNC); struct mmu_notifier_range range; int *cleaned = arg; @@ -1419,11 +1410,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, address, 0); pte_t pteval; struct page *subpage; bool ret = true; @@ -1714,11 +1701,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, address, 0); pte_t pteval; struct page *subpage; bool ret = true; @@ -2001,11 +1984,7 @@ static bool page_make_device_exclusive_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *priv) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, address, 0); struct make_exclusive_args *args = priv; pte_t pteval; struct page *subpage; -- cgit v1.2.3-70-g09d2 From 2aff7a4755bed2870ee23b75bc88cdc8d76cdd03 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 3 Feb 2022 11:40:17 -0500 Subject: mm: Convert page_vma_mapped_walk to work on PFNs page_mapped_in_vma() really just wants to walk one page, but as the code stands, if passed the head page of a compound page, it will walk every page in the compound page. Extract pfn/nr_pages/pgoff from the struct page early, so they can be overridden by page_mapped_in_vma(). Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/hugetlb.h | 5 +++++ include/linux/rmap.h | 17 ++++++++++----- mm/internal.h | 15 ++++++++----- mm/migrate.c | 5 +++-- mm/page_vma_mapped.c | 58 ++++++++++++++++++++++--------------------------- mm/rmap.c | 8 +++---- 6 files changed, 58 insertions(+), 50 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1897a69c540..6ba2f8e74fbb 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -970,6 +970,11 @@ static inline struct hstate *page_hstate(struct page *page) return NULL; } +static inline struct hstate *size_to_hstate(unsigned long size) +{ + return NULL; +} + static inline unsigned long huge_page_size(struct hstate *h) { return PAGE_SIZE; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0d894a2bfaa1..0c838ba1a8ee 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -201,11 +202,13 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) -/* Look for migarion entries rather than present PTEs */ +/* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) struct page_vma_mapped_walk { - struct page *page; + unsigned long pfn; + unsigned long nr_pages; + pgoff_t pgoff; struct vm_area_struct *vma; unsigned long address; pmd_t *pmd; @@ -216,7 +219,9 @@ struct page_vma_mapped_walk { #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ - .page = _page, \ + .pfn = page_to_pfn(_page), \ + .nr_pages = compound_nr(page), \ + .pgoff = page_to_pgoff(page), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ @@ -224,7 +229,9 @@ struct page_vma_mapped_walk { #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ - .page = &_folio->page, \ + .pfn = folio_pfn(_folio), \ + .nr_pages = folio_nr_pages(_folio), \ + .pgoff = folio_pgoff(_folio), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ @@ -233,7 +240,7 @@ struct page_vma_mapped_walk { static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) { /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ - if (pvmw->pte && !PageHuge(pvmw->page)) + if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma)) pte_unmap(pvmw->pte); if (pvmw->ptl) spin_unlock(pvmw->ptl); diff --git a/mm/internal.h b/mm/internal.h index 6047268076e7..3b652444f070 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -10,6 +10,7 @@ #include #include #include +#include #include struct folio_batch; @@ -475,18 +476,20 @@ vma_address(struct page *page, struct vm_area_struct *vma) } /* - * Then at what user virtual address will none of the page be found in vma? + * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. - * If page is a compound head, the entire compound page is considered. */ -static inline unsigned long -vma_address_end(struct page *page, struct vm_area_struct *vma) +static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) { + struct vm_area_struct *vma = pvmw->vma; pgoff_t pgoff; unsigned long address; - VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ - pgoff = page_to_pgoff(page) + compound_nr(page); + /* Common case, plus ->pgoff is invalid for KSM */ + if (pvmw->nr_pages == 1) + return pvmw->address + PAGE_SIZE; + + pgoff = pvmw->pgoff + pvmw->nr_pages; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address > vma->vm_end) diff --git a/mm/migrate.c b/mm/migrate.c index 71f92e8ed934..358bc311caaa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -174,7 +174,8 @@ void putback_movable_pages(struct list_head *l) static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { - DEFINE_PAGE_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); + DEFINE_PAGE_VMA_WALK(pvmw, (struct page *)old, vma, addr, + PVMW_SYNC | PVMW_MIGRATION); struct page *new; pte_t pte; swp_entry_t entry; @@ -184,7 +185,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (PageKsm(page)) new = page; else - new = page - pvmw.page->index + + new = page - pvmw.pgoff + linear_page_index(vma, pvmw.address); #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index f7b331081791..1187f9c1ec5b 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -53,18 +53,6 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) return true; } -static inline bool pfn_is_match(struct page *page, unsigned long pfn) -{ - unsigned long page_pfn = page_to_pfn(page); - - /* normal page and hugetlbfs page */ - if (!PageTransCompound(page) || PageHuge(page)) - return page_pfn == pfn; - - /* THP can be referenced by any subpage */ - return pfn >= page_pfn && pfn - page_pfn < thp_nr_pages(page); -} - /** * check_pte - check if @pvmw->page is mapped at the @pvmw->pte * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking @@ -116,7 +104,17 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) pfn = pte_pfn(*pvmw->pte); } - return pfn_is_match(pvmw->page, pfn); + return (pfn - pvmw->pfn) < pvmw->nr_pages; +} + +/* Returns true if the two ranges overlap. Careful to not overflow. */ +static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw) +{ + if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn) + return false; + if (pfn > pvmw->pfn + pvmw->nr_pages - 1) + return false; + return true; } static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) @@ -127,7 +125,7 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) } /** - * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at + * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at * @pvmw->address * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags * must be set. pmd, pte and ptl must be NULL. @@ -152,8 +150,8 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) */ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) { - struct mm_struct *mm = pvmw->vma->vm_mm; - struct page *page = pvmw->page; + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; unsigned long end; pgd_t *pgd; p4d_t *p4d; @@ -164,32 +162,26 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pmd && !pvmw->pte) return not_found(pvmw); - if (unlikely(PageHuge(page))) { + if (unlikely(is_vm_hugetlb_page(vma))) { + unsigned long size = pvmw->nr_pages * PAGE_SIZE; /* The only possible mapping was handled on last iteration */ if (pvmw->pte) return not_found(pvmw); /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); + pvmw->pte = huge_pte_offset(mm, pvmw->address, size); if (!pvmw->pte) return false; - pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte); + pvmw->ptl = huge_pte_lockptr(size_to_hstate(size), mm, + pvmw->pte); spin_lock(pvmw->ptl); if (!check_pte(pvmw)) return not_found(pvmw); return true; } - /* - * Seek to next pte only makes sense for THP. - * But more important than that optimization, is to filter out - * any PageKsm page: whose page->index misleads vma_address() - * and vma_address_end() to disaster. - */ - end = PageTransCompound(page) ? - vma_address_end(page, pvmw->vma) : - pvmw->address + PAGE_SIZE; + end = vma_address_end(pvmw); if (pvmw->pte) goto next_pte; restart: @@ -224,7 +216,7 @@ restart: if (likely(pmd_trans_huge(pmde))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); - if (pmd_page(pmde) != page) + if (!check_pmd(pmd_pfn(pmde), pvmw)) return not_found(pvmw); return true; } @@ -236,7 +228,7 @@ restart: return not_found(pvmw); entry = pmd_to_swp_entry(pmde); if (!is_migration_entry(entry) || - pfn_swap_entry_to_page(entry) != page) + !check_pmd(swp_offset(entry), pvmw)) return not_found(pvmw); return true; } @@ -250,7 +242,8 @@ restart: * cleared *pmd but not decremented compound_mapcount(). */ if ((pvmw->flags & PVMW_SYNC) && - PageTransCompound(page)) { + transparent_hugepage_active(vma) && + (pvmw->nr_pages >= HPAGE_PMD_NR)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); spin_unlock(ptl); @@ -307,7 +300,8 @@ next_pte: int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { struct page_vma_mapped_walk pvmw = { - .page = page, + .pfn = page_to_pfn(page), + .nr_pages = 1, .vma = vma, .flags = PVMW_SYNC, }; diff --git a/mm/rmap.c b/mm/rmap.c index a7f06b76b503..e27ba4172069 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -940,7 +940,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - vma_address_end(page, vma)); + vma_address_end(&pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { @@ -1437,8 +1437,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - range.end = PageKsm(page) ? - address + PAGE_SIZE : vma_address_end(page, vma); + range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, range.end); if (PageHuge(page)) { @@ -1732,8 +1731,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - range.end = PageKsm(page) ? - address + PAGE_SIZE : vma_address_end(page, vma); + range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, range.end); if (PageHuge(page)) { -- cgit v1.2.3-70-g09d2 From 4b8554c527f3cfa183f6c06d231a9387873205a0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 28 Jan 2022 14:29:43 -0500 Subject: mm/rmap: Convert try_to_migrate() to folios Convert the callers to pass a folio and the try_to_migrate_one() worker to use a folio throughout. Fixes an assumption that a folio must be <= PMD size. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 4 ++-- mm/migrate.c | 6 ++++-- mm/migrate_device.c | 6 ++++-- mm/rmap.c | 59 +++++++++++++++++++++++++++------------------------- 5 files changed, 42 insertions(+), 35 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a0c5c38c733f..e6e935c81414 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -193,7 +193,7 @@ static inline void page_dup_rmap(struct page *page, bool compound) int folio_referenced(struct folio *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -void try_to_migrate(struct page *page, enum ttu_flags flags); +void try_to_migrate(struct folio *folio, enum ttu_flags flags); void try_to_unmap(struct folio *, enum ttu_flags flags); int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de684427f79c..7df1934d6528 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2262,8 +2262,8 @@ static void unmap_page(struct page *page) * pages can simply be left unmapped, then faulted back on demand. * If that is ever changed (perhaps for mlock), update remap_page(). */ - if (PageAnon(page)) - try_to_migrate(page, ttu_flags); + if (folio_test_anon(folio)) + try_to_migrate(folio, ttu_flags); else try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); diff --git a/mm/migrate.c b/mm/migrate.c index 358bc311caaa..6ed85a5d1be5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -912,6 +912,7 @@ out: static int __unmap_and_move(struct page *page, struct page *newpage, int force, enum migrate_mode mode) { + struct folio *folio = page_folio(page); int rc = -EAGAIN; bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; @@ -1015,7 +1016,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration ptes */ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); - try_to_migrate(page, 0); + try_to_migrate(folio, 0); page_was_mapped = true; } @@ -1165,6 +1166,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, enum migrate_mode mode, int reason, struct list_head *ret) { + struct folio *src = page_folio(hpage); int rc = -EAGAIN; int page_was_mapped = 0; struct page *new_hpage; @@ -1241,7 +1243,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, ttu |= TTU_RMAP_LOCKED; } - try_to_migrate(hpage, ttu); + try_to_migrate(src, ttu); page_was_mapped = 1; if (mapping_locked) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 0326b901d2fd..b2c611d4bdb2 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -333,6 +333,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) for (i = 0; i < npages; i++) { struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct folio *folio; if (!page) continue; @@ -356,8 +357,9 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) put_page(page); } - if (page_mapped(page)) - try_to_migrate(page, 0); + folio = page_folio(page); + if (folio_mapped(folio)) + try_to_migrate(folio, 0); if (page_mapped(page) || !migrate_vma_check_page(page)) { if (!is_zone_device_page(page)) { diff --git a/mm/rmap.c b/mm/rmap.c index cf6e3de9d2f7..8497da29193c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1706,7 +1706,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, { struct folio *folio = page_folio(page); struct mm_struct *mm = vma->vm_mm; - DEFINE_PAGE_VMA_WALK(pvmw, page, vma, address, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; struct page *subpage; bool ret = true; @@ -1740,7 +1740,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, range.end); - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. @@ -1754,21 +1754,24 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { - VM_BUG_ON_PAGE(PageHuge(page) || - !PageTransCompound(page), page); + subpage = folio_page(folio, + pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || + !folio_test_pmd_mappable(folio), folio); - set_pmd_migration_entry(&pvmw, page); + set_pmd_migration_entry(&pvmw, subpage); continue; } #endif /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); + VM_BUG_ON_FOLIO(!pvmw.pte, folio); - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); address = pvmw.address; - if (PageHuge(page) && !PageAnon(page)) { + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly @@ -1806,15 +1809,15 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); pteval = ptep_clear_flush(vma, address, pvmw.pte); - /* Move the dirty bit to the page. Now the pte is gone. */ + /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) - set_page_dirty(page); + folio_mark_dirty(folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (is_zone_device_page(page)) { - unsigned long pfn = page_to_pfn(page); + if (folio_is_zone_device(folio)) { + unsigned long pfn = folio_pfn(folio); swp_entry_t entry; pte_t swp_pte; @@ -1850,16 +1853,16 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * changed when hugepage migrations to device private * memory are supported. */ - subpage = page; - } else if (PageHWPoison(page)) { + subpage = &folio->page; + } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); - if (PageHuge(page)) { - hugetlb_count_sub(compound_nr(page), mm); + if (folio_test_hugetlb(folio)) { + hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_swap_pte_at(mm, address, pvmw.pte, pteval, vma_mmu_pagesize(vma)); } else { - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -1874,7 +1877,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. */ - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(&folio->page)); /* We have to invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@ -1920,10 +1923,10 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, vma, PageHuge(page)); + page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) mlock_page_drain(smp_processor_id()); - put_page(page); + folio_put(folio); } mmu_notifier_invalidate_range_end(&range); @@ -1933,13 +1936,13 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, /** * try_to_migrate - try to replace all page table mappings with swap entries - * @page: the page to replace page table entries for + * @folio: the folio to replace page table entries for * @flags: action and flags * - * Tries to remove all the page table entries which are mapping this page and - * replace them with special swap entries. Caller must hold the page lock. + * Tries to remove all the page table entries which are mapping this folio and + * replace them with special swap entries. Caller must hold the folio lock. */ -void try_to_migrate(struct page *page, enum ttu_flags flags) +void try_to_migrate(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, @@ -1956,7 +1959,7 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) TTU_SYNC))) return; - if (is_zone_device_page(page) && !is_device_private_page(page)) + if (folio_is_zone_device(folio) && !folio_is_device_private(folio)) return; /* @@ -1967,13 +1970,13 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if (!PageKsm(page) && PageAnon(page)) + if (!folio_test_ksm(folio) && folio_test_anon(folio)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - rmap_walk_locked(page, &rwc); + rmap_walk_locked(&folio->page, &rwc); else - rmap_walk(page, &rwc); + rmap_walk(&folio->page, &rwc); } #ifdef CONFIG_DEVICE_PRIVATE -- cgit v1.2.3-70-g09d2 From 4eecb8b9163df82c87c91764a02fff228ef25f6d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 28 Jan 2022 23:32:59 -0500 Subject: mm/migrate: Convert remove_migration_ptes() to folios Convert the implementation and all callers. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 24 ++++++++++++----------- mm/migrate.c | 55 ++++++++++++++++++++++++++++------------------------ mm/migrate_device.c | 15 +++++++++----- 4 files changed, 54 insertions(+), 42 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e6e935c81414..21af80d5b711 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -261,7 +261,7 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int folio_mkclean(struct folio *); -void remove_migration_ptes(struct page *old, struct page *new, bool locked); +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); /* * Called by memory-failure.c to kill processes. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7df1934d6528..d55b25f1ceba 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2270,18 +2270,19 @@ static void unmap_page(struct page *page) VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } -static void remap_page(struct page *page, unsigned int nr) +static void remap_page(struct folio *folio, unsigned long nr) { - int i; + int i = 0; /* If unmap_page() uses try_to_migrate() on file, remove this check */ - if (!PageAnon(page)) + if (!folio_test_anon(folio)) return; - if (PageTransHuge(page)) { - remove_migration_ptes(page, page, true); - } else { - for (i = 0; i < nr; i++) - remove_migration_ptes(page + i, page + i, true); + for (;;) { + remove_migration_ptes(folio, folio, true); + i += folio_nr_pages(folio); + if (i >= nr) + break; + folio = folio_next(folio); } } @@ -2441,7 +2442,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } local_irq_enable(); - remap_page(head, nr); + remap_page(folio, nr); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2550,7 +2551,8 @@ bool can_split_huge_page(struct page *page, int *pextra_pins) */ int split_huge_page_to_list(struct page *page, struct list_head *list) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); + struct page *head = &folio->page; struct deferred_split *ds_queue = get_deferred_split_queue(head); XA_STATE(xas, &head->mapping->i_pages, head->index); struct anon_vma *anon_vma = NULL; @@ -2667,7 +2669,7 @@ fail: if (mapping) xas_unlock(&xas); local_irq_enable(); - remap_page(head, thp_nr_pages(head)); + remap_page(folio, folio_nr_pages(folio)); ret = -EBUSY; } diff --git a/mm/migrate.c b/mm/migrate.c index 6ed85a5d1be5..eba3cd5376e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -174,30 +174,32 @@ void putback_movable_pages(struct list_head *l) static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { - DEFINE_PAGE_VMA_WALK(pvmw, (struct page *)old, vma, addr, - PVMW_SYNC | PVMW_MIGRATION); - struct page *new; - pte_t pte; - swp_entry_t entry; + struct folio *folio = page_folio(page); + DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); VM_BUG_ON_PAGE(PageTail(page), page); while (page_vma_mapped_walk(&pvmw)) { - if (PageKsm(page)) - new = page; - else - new = page - pvmw.pgoff + - linear_page_index(vma, pvmw.address); + pte_t pte; + swp_entry_t entry; + struct page *new; + unsigned long idx = 0; + + /* pgoff is invalid for ksm pages, but they are never large */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff; + new = folio_page(folio, idx); #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { - VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || + !folio_test_pmd_mappable(folio), folio); remove_migration_pmd(&pvmw, new); continue; } #endif - get_page(new); + folio_get(folio); pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*pvmw.pte)) pte = pte_mksoft_dirty(pte); @@ -226,12 +228,12 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, } #ifdef CONFIG_HUGETLB_PAGE - if (PageHuge(new)) { + if (folio_test_hugetlb(folio)) { unsigned int shift = huge_page_shift(hstate_vma(vma)); pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); - if (PageAnon(new)) + if (folio_test_anon(folio)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); @@ -239,7 +241,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, } else #endif { - if (PageAnon(new)) + if (folio_test_anon(folio)) page_add_anon_rmap(new, vma, pvmw.address, false); else page_add_file_rmap(new, vma, false); @@ -259,17 +261,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, * Get rid of all migration entries and replace them by * references to the indicated page. */ -void remove_migration_ptes(struct page *old, struct page *new, bool locked) +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) { struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, - .arg = old, + .arg = src, }; if (locked) - rmap_walk_locked(new, &rwc); + rmap_walk_locked(&dst->page, &rwc); else - rmap_walk(new, &rwc); + rmap_walk(&dst->page, &rwc); } /* @@ -756,6 +758,7 @@ int buffer_migrate_page_norefs(struct address_space *mapping, */ static int writeout(struct address_space *mapping, struct page *page) { + struct folio *folio = page_folio(page); struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = 1, @@ -781,7 +784,7 @@ static int writeout(struct address_space *mapping, struct page *page) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(page, page, false); + remove_migration_ptes(folio, folio, false); rc = mapping->a_ops->writepage(page, &wbc); @@ -913,6 +916,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, int force, enum migrate_mode mode) { struct folio *folio = page_folio(page); + struct folio *dst = page_folio(newpage); int rc = -EAGAIN; bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; @@ -1039,8 +1043,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } if (page_was_mapped) - remove_migration_ptes(page, - rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); + remove_migration_ptes(folio, + rc == MIGRATEPAGE_SUCCESS ? dst : folio, false); out_unlock_both: unlock_page(newpage); @@ -1166,7 +1170,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, enum migrate_mode mode, int reason, struct list_head *ret) { - struct folio *src = page_folio(hpage); + struct folio *dst, *src = page_folio(hpage); int rc = -EAGAIN; int page_was_mapped = 0; struct page *new_hpage; @@ -1194,6 +1198,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; + dst = page_folio(new_hpage); if (!trylock_page(hpage)) { if (!force) @@ -1254,8 +1259,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = move_to_new_page(new_hpage, hpage, mode); if (page_was_mapped) - remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); + remove_migration_ptes(src, + rc == MIGRATEPAGE_SUCCESS ? dst : src, false); unlock_put_anon: unlock_page(new_hpage); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index b2c611d4bdb2..70c7dc05bbfc 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -376,15 +376,17 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) for (i = 0; i < npages && restore; i++) { struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct folio *folio; if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) continue; - remove_migration_ptes(page, page, false); + folio = page_folio(page); + remove_migration_ptes(folio, folio, false); migrate->src[i] = 0; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); restore--; } } @@ -729,6 +731,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) unsigned long i; for (i = 0; i < npages; i++) { + struct folio *dst, *src; struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); struct page *page = migrate_pfn_to_page(migrate->src[i]); @@ -748,8 +751,10 @@ void migrate_vma_finalize(struct migrate_vma *migrate) newpage = page; } - remove_migration_ptes(page, newpage, false); - unlock_page(page); + src = page_folio(page); + dst = page_folio(newpage); + remove_migration_ptes(src, dst, false); + folio_unlock(src); if (is_zone_device_page(page)) put_page(page); -- cgit v1.2.3-70-g09d2 From 2f031c6f042cb8a9b221a8b6b80e69de5170f830 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 29 Jan 2022 16:06:53 -0500 Subject: mm/rmap: Convert rmap_walk() to take a folio This ripples all the way through to every calling and called function from rmap. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/ksm.h | 4 +- include/linux/rmap.h | 11 +++-- mm/damon/paddr.c | 15 ++++--- mm/folio-compat.c | 7 ---- mm/huge_memory.c | 2 +- mm/ksm.c | 12 +++--- mm/migrate.c | 10 ++--- mm/page_idle.c | 7 ++-- mm/rmap.c | 111 ++++++++++++++++++++++++--------------------------- 9 files changed, 80 insertions(+), 99 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index a38a5bca1ba5..0b4f17418f64 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -51,7 +51,7 @@ static inline void ksm_exit(struct mm_struct *mm) struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #else /* !CONFIG_KSM */ @@ -78,7 +78,7 @@ static inline struct page *ksm_might_need_to_copy(struct page *page, return page; } -static inline void rmap_walk_ksm(struct page *page, +static inline void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index be020d38b0a5..a74d811c5b3f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -266,7 +266,6 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); /* * Called by memory-failure.c to kill processes. */ -struct anon_vma *page_lock_anon_vma_read(struct page *page); struct anon_vma *folio_lock_anon_vma_read(struct folio *folio); void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); @@ -286,15 +285,15 @@ struct rmap_walk_control { * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. */ - bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, + bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); - int (*done)(struct page *page); - struct anon_vma *(*anon_lock)(struct page *page); + int (*done)(struct folio *folio); + struct anon_vma *(*anon_lock)(struct folio *folio); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -void rmap_walk(struct page *page, struct rmap_walk_control *rwc); -void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index eee8a3395d71..ae24549921e2 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -16,10 +16,10 @@ #include "../internal.h" #include "prmtv-common.h" -static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, +static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { - DEFINE_PAGE_VMA_WALK(pvmw, page, vma, addr, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; @@ -37,7 +37,7 @@ static void damon_pa_mkold(unsigned long paddr) struct page *page = damon_get_page(PHYS_PFN(paddr)); struct rmap_walk_control rwc = { .rmap_one = __damon_pa_mkold, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; @@ -54,7 +54,7 @@ static void damon_pa_mkold(unsigned long paddr) if (need_lock && !folio_trylock(folio)) goto out; - rmap_walk(&folio->page, &rwc); + rmap_walk(folio, &rwc); if (need_lock) folio_unlock(folio); @@ -87,10 +87,9 @@ struct damon_pa_access_chk_result { bool accessed; }; -static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, +static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct folio *folio = page_folio(page); struct damon_pa_access_chk_result *result = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); @@ -133,7 +132,7 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) struct rmap_walk_control rwc = { .arg = &result, .rmap_one = __damon_pa_young, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; @@ -156,7 +155,7 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) return NULL; } - rmap_walk(&folio->page, &rwc); + rmap_walk(folio, &rwc); if (need_lock) folio_unlock(folio); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 968ad97bbffa..46fa179e32fb 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -164,10 +164,3 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } - -#ifdef CONFIG_MMU -struct anon_vma *page_lock_anon_vma_read(struct page *page) -{ - return folio_lock_anon_vma_read(page_folio(page)); -} -#endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d55b25f1ceba..d874d50e703b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2572,7 +2572,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a * reference to it and then lock the anon_vma for write. This - * is similar to page_lock_anon_vma_read except the write lock + * is similar to folio_lock_anon_vma_read except the write lock * is taken to serialise against parallel split or collapse * operations. */ diff --git a/mm/ksm.c b/mm/ksm.c index b25d545e0cd1..dd737f925c04 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2588,21 +2588,21 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; int search_new_forks = 0; - VM_BUG_ON_PAGE(!PageKsm(page), page); + VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. */ - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - stable_node = page_stable_node(page); + stable_node = folio_stable_node(folio); if (!stable_node) return; again: @@ -2637,11 +2637,11 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (!rwc->rmap_one(page, vma, addr, rwc->arg)) { + if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } - if (rwc->done && rwc->done(page)) { + if (rwc->done && rwc->done(folio)) { anon_vma_unlock_read(anon_vma); return; } diff --git a/mm/migrate.c b/mm/migrate.c index eba3cd5376e3..2defe9aa4d0e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -171,13 +171,11 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, - unsigned long addr, void *old) +static bool remove_migration_pte(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *old) { - struct folio *folio = page_folio(page); DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); - VM_BUG_ON_PAGE(PageTail(page), page); while (page_vma_mapped_walk(&pvmw)) { pte_t pte; swp_entry_t entry; @@ -269,9 +267,9 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) }; if (locked) - rmap_walk_locked(&dst->page, &rwc); + rmap_walk_locked(dst, &rwc); else - rmap_walk(&dst->page, &rwc); + rmap_walk(dst, &rwc); } /* diff --git a/mm/page_idle.c b/mm/page_idle.c index 5c73a9b578da..e34ba04e22e2 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -46,11 +46,10 @@ static struct page *page_idle_get_page(unsigned long pfn) return page; } -static bool page_idle_clear_pte_refs_one(struct page *page, +static bool page_idle_clear_pte_refs_one(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct folio *folio = page_folio(page); DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); bool referenced = false; @@ -93,7 +92,7 @@ static void page_idle_clear_pte_refs(struct page *page) */ static const struct rmap_walk_control rwc = { .rmap_one = page_idle_clear_pte_refs_one, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; @@ -104,7 +103,7 @@ static void page_idle_clear_pte_refs(struct page *page) if (need_lock && !folio_trylock(folio)) return; - rmap_walk(&folio->page, (struct rmap_walk_control *)&rwc); + rmap_walk(folio, (struct rmap_walk_control *)&rwc); if (need_lock) folio_unlock(folio); diff --git a/mm/rmap.c b/mm/rmap.c index 09301aecf2fc..2cc97c64901e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -107,15 +107,15 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma_read() such that + * Synchronize against folio_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by - * down_read_trylock() from page_lock_anon_vma_read(). This orders: + * down_read_trylock() from folio_lock_anon_vma_read(). This orders: * - * page_lock_anon_vma_read() VS put_anon_vma() + * folio_lock_anon_vma_read() VS put_anon_vma() * down_read_trylock() atomic_dec_and_test() * LOCK MB * atomic_read() rwsem_is_locked() @@ -168,7 +168,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * allocate a new one. * * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma_read() + * optimistically looked up an anon_vma in folio_lock_anon_vma_read() * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). @@ -799,10 +799,9 @@ struct folio_referenced_arg { /* * arg: folio_referenced_arg will be passed */ -static bool folio_referenced_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, void *arg) +static bool folio_referenced_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long address, void *arg) { - struct folio *folio = page_folio(page); struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int referenced = 0; @@ -894,7 +893,7 @@ int folio_referenced(struct folio *folio, int is_locked, struct rmap_walk_control rwc = { .rmap_one = folio_referenced_one, .arg = (void *)&pra, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; *vm_flags = 0; @@ -919,7 +918,7 @@ int folio_referenced(struct folio *folio, int is_locked, rwc.invalid_vma = invalid_folio_referenced_vma; } - rmap_walk(&folio->page, &rwc); + rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; if (we_locked) @@ -928,10 +927,9 @@ int folio_referenced(struct folio *folio, int is_locked, return pra.referenced; } -static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, +static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { - struct folio *folio = page_folio(page); DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); struct mmu_notifier_range range; int *cleaned = arg; @@ -1025,7 +1023,7 @@ int folio_mkclean(struct folio *folio) if (!mapping) return 0; - rmap_walk(&folio->page, &rwc); + rmap_walk(folio, &rwc); return cleaned; } @@ -1410,10 +1408,9 @@ out: /* * @arg: enum ttu_flags will be passed to this argument */ -static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { - struct folio *folio = page_folio(page); struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; @@ -1667,9 +1664,9 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return vma_is_temporary_stack(vma); } -static int page_not_mapped(struct page *page) +static int page_not_mapped(struct folio *folio) { - return !page_mapped(page); + return !folio_mapped(folio); } /** @@ -1689,13 +1686,13 @@ void try_to_unmap(struct folio *folio, enum ttu_flags flags) .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; if (flags & TTU_RMAP_LOCKED) - rmap_walk_locked(&folio->page, &rwc); + rmap_walk_locked(folio, &rwc); else - rmap_walk(&folio->page, &rwc); + rmap_walk(folio, &rwc); } /* @@ -1704,10 +1701,9 @@ void try_to_unmap(struct folio *folio, enum ttu_flags flags) * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs * containing migration entries. */ -static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, +static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { - struct folio *folio = page_folio(page); struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; @@ -1951,7 +1947,7 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) .rmap_one = try_to_migrate_one, .arg = (void *)flags, .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; /* @@ -1977,9 +1973,9 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - rmap_walk_locked(&folio->page, &rwc); + rmap_walk_locked(folio, &rwc); else - rmap_walk(&folio->page, &rwc); + rmap_walk(folio, &rwc); } #ifdef CONFIG_DEVICE_PRIVATE @@ -1990,10 +1986,9 @@ struct make_exclusive_args { bool valid; }; -static bool page_make_device_exclusive_one(struct page *page, +static bool page_make_device_exclusive_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *priv) { - struct folio *folio = page_folio(page); struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); struct make_exclusive_args *args = priv; @@ -2098,7 +2093,7 @@ static bool folio_make_device_exclusive(struct folio *folio, struct rmap_walk_control rwc = { .rmap_one = page_make_device_exclusive_one, .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, .arg = &args, }; @@ -2109,7 +2104,7 @@ static bool folio_make_device_exclusive(struct folio *folio, if (!folio_test_anon(folio)) return false; - rmap_walk(&folio->page, &rwc); + rmap_walk(folio, &rwc); return args.valid && !folio_mapcount(folio); } @@ -2177,17 +2172,16 @@ void __put_anon_vma(struct anon_vma *anon_vma) anon_vma_free(root); } -static struct anon_vma *rmap_walk_anon_lock(struct page *page, +static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, struct rmap_walk_control *rwc) { - struct folio *folio = page_folio(page); struct anon_vma *anon_vma; if (rwc->anon_lock) - return rwc->anon_lock(page); + return rwc->anon_lock(folio); /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() + * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing @@ -2209,10 +2203,9 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. */ -static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_anon(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { - struct folio *folio = page_folio(page); struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; @@ -2222,17 +2215,17 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, /* anon_vma disappear under us? */ VM_BUG_ON_FOLIO(!anon_vma, folio); } else { - anon_vma = rmap_walk_anon_lock(page, rwc); + anon_vma = rmap_walk_anon_lock(folio, rwc); } if (!anon_vma) return; - pgoff_start = page_to_pgoff(page); - pgoff_end = pgoff_start + thp_nr_pages(page) - 1; + pgoff_start = folio_pgoff(folio); + pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); + unsigned long address = vma_address(&folio->page, vma); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2240,9 +2233,9 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (!rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(folio, vma, address, rwc->arg)) break; - if (rwc->done && rwc->done(page)) + if (rwc->done && rwc->done(folio)) break; } @@ -2258,10 +2251,10 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. */ -static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_file(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; @@ -2271,18 +2264,18 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, * structure at mapping cannot be freed and reused yet, * so we can safely take mapping->i_mmap_rwsem. */ - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!mapping) return; - pgoff_start = page_to_pgoff(page); - pgoff_end = pgoff_start + thp_nr_pages(page) - 1; + pgoff_start = folio_pgoff(folio); + pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; if (!locked) i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { - unsigned long address = vma_address(page, vma); + unsigned long address = vma_address(&folio->page, vma); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2290,9 +2283,9 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (!rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(folio, vma, address, rwc->arg)) goto done; - if (rwc->done && rwc->done(page)) + if (rwc->done && rwc->done(folio)) goto done; } @@ -2301,25 +2294,25 @@ done: i_mmap_unlock_read(mapping); } -void rmap_walk(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { - if (unlikely(PageKsm(page))) - rmap_walk_ksm(page, rwc); - else if (PageAnon(page)) - rmap_walk_anon(page, rwc, false); + if (unlikely(folio_test_ksm(folio))) + rmap_walk_ksm(folio, rwc); + else if (folio_test_anon(folio)) + rmap_walk_anon(folio, rwc, false); else - rmap_walk_file(page, rwc, false); + rmap_walk_file(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ -void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { /* no ksm support for now */ - VM_BUG_ON_PAGE(PageKsm(page), page); - if (PageAnon(page)) - rmap_walk_anon(page, rwc, true); + VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); + if (folio_test_anon(folio)) + rmap_walk_anon(folio, rwc, true); else - rmap_walk_file(page, rwc, true); + rmap_walk_file(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3-70-g09d2