From 09180ca4b3bc7b1ffdbae62a75f716dfc0685861 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 6 Sep 2017 16:20:58 -0700 Subject: mm/gup: make __gup_device_* require THP These functions are the only bits of generic code that use {pud,pmd}_pfn() without checking for CONFIG_TRANSPARENT_HUGEPAGE. This works fine on x86, the only arch with devmap support, since the *_pfn() functions are always defined there, but this isn't true for every architecture. Link: http://lkml.kernel.org/r/20170626063833.11094-1-oohall@gmail.com Signed-off-by: Oliver O'Halloran Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/gup.c') diff --git a/mm/gup.c b/mm/gup.c index 23f01c40c88f..33d651deeae2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* __HAVE_ARCH_PTE_SPECIAL */ -#ifdef __HAVE_ARCH_PTE_DEVMAP +#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { -- cgit v1.2.3-70-g09d2 From 84c3fc4e9c563d8fb91cfdf5948da48fe1af34d3 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 8 Sep 2017 16:11:01 -0700 Subject: mm: thp: check pmd migration entry in common path When THP migration is being used, memory management code needs to handle pmd migration entries properly. This patch uses !pmd_present() or is_swap_pmd() (depending on whether pmd_none() needs separate code or not) to check pmd migration entries at the places where a pmd entry is present. Since pmd-related code uses split_huge_page(), split_huge_pmd(), pmd_trans_huge(), pmd_trans_unstable(), or pmd_none_or_trans_huge_or_clear_bad(), this patch: 1. adds pmd migration entry split code in split_huge_pmd(), 2. takes care of pmd migration entries whenever pmd_trans_huge() is present, 3. makes pmd_none_or_trans_huge_or_clear_bad() pmd migration entry aware. Since split_huge_page() uses split_huge_pmd() and pmd_trans_unstable() is equivalent to pmd_none_or_trans_huge_or_clear_bad(), we do not change them. Until this commit, a pmd entry should be: 1. pointing to a pte page, 2. is_swap_pmd(), 3. pmd_trans_huge(), 4. pmd_devmap(), or 5. pmd_none(). Signed-off-by: Zi Yan Cc: Kirill A. Shutemov Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Mel Gorman Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 32 +++++++++++++-------- include/asm-generic/pgtable.h | 18 +++++++++++- include/linux/huge_mm.h | 14 ++++++++-- mm/gup.c | 22 +++++++++++++-- mm/huge_memory.c | 65 +++++++++++++++++++++++++++++++++++++++---- mm/memcontrol.c | 5 ++++ mm/memory.c | 12 ++++++-- mm/mprotect.c | 4 +-- mm/mremap.c | 2 +- 9 files changed, 147 insertions(+), 27 deletions(-) (limited to 'mm/gup.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a290966f91ec..8eec35af32e4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -608,7 +608,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - smaps_pmd_entry(pmd, addr, walk); + if (pmd_present(*pmd)) + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; } @@ -1012,6 +1013,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, goto out; } + if (!pmd_present(*pmd)) + goto out; + page = pmd_page(*pmd); /* Clear accessed and referenced bits. */ @@ -1293,27 +1297,33 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; + struct page *page = NULL; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; - /* - * Currently pmd for thp is always present because thp - * can not be swapped-out, migrated, or HWPOISONed - * (split in such cases instead.) - * This if-check is just to prepare for future implementation. - */ if (pmd_present(pmd)) { - struct page *page = pmd_page(pmd); - - if (page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; + page = pmd_page(pmd); flags |= PM_PRESENT; if (pm->show_pfn) frame = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + else if (is_swap_pmd(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + flags |= PM_SWAP; + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + page = migration_entry_to_page(entry); + } +#endif + + if (page && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 4d7bb98f4134..4f93a6d10a47 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -846,7 +846,23 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) #ifdef CONFIG_TRANSPARENT_HUGEPAGE barrier(); #endif - if (pmd_none(pmdval) || pmd_trans_huge(pmdval)) + /* + * !pmd_present() checks for pmd migration entries + * + * The complete check uses is_pmd_migration_entry() in linux/swapops.h + * But using that requires moving current function and pmd_trans_unstable() + * to linux/swapops.h to resovle dependency, which is too much code move. + * + * !pmd_present() is equivalent to is_pmd_migration_entry() currently, + * because !pmd_present() pages can only be under migration not swapped + * out. + * + * pmd_none() is preseved for future condition checks on pmd migration + * entries and not confusing with this function name, although it is + * redundant with !pmd_present(). + */ + if (pmd_none(pmdval) || pmd_trans_huge(pmdval) || + (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval))) return 1; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d8f35a0865dc..14bc21c2ee7f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -147,7 +147,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (pmd_trans_huge(*____pmd) \ + if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ @@ -178,12 +178,18 @@ extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); extern spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); + +static inline int is_swap_pmd(pmd_t pmd) +{ + return !pmd_none(pmd) && !pmd_present(pmd); +} + /* mmap_sem must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; @@ -299,6 +305,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } +static inline int is_swap_pmd(pmd_t pmd) +{ + return 0; +} static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { diff --git a/mm/gup.c b/mm/gup.c index 33d651deeae2..76fd199aaae2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -234,6 +234,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } +retry: + if (!pmd_present(*pmd)) { + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(*pmd)); + if (is_pmd_migration_entry(*pmd)) + pmd_migration_entry_wait(mm, pmd); + goto retry; + } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -247,7 +257,15 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); +retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_present(*pmd))) { + spin_unlock(ptl); + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + pmd_migration_entry_wait(mm, pmd); + goto retry_locked; + } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags); @@ -424,7 +442,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud = pud_offset(p4d, address); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return -EFAULT; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); @@ -1534,7 +1552,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 937f007794dd..b82585eabe85 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -928,6 +928,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (unlikely(is_swap_pmd(pmd))) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + if (is_write_migration_entry(entry)) { + make_migration_entry_read(&entry); + pmd = swp_entry_to_pmd(entry); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + ret = 0; + goto out_unlock; + } +#endif + if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; @@ -1599,6 +1616,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_huge_zero_pmd(orig_pmd)) goto out; + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto out; + } + page = pmd_page(orig_pmd); /* * If other processes are mapping this page, we couldn't discard @@ -1810,6 +1833,25 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, preserve_write = prot_numa && pmd_write(*pmd); ret = 1; +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (is_swap_pmd(*pmd)) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + VM_BUG_ON(!is_pmd_migration_entry(*pmd)); + if (is_write_migration_entry(entry)) { + pmd_t newpmd; + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + newpmd = swp_entry_to_pmd(entry); + set_pmd_at(mm, addr, pmd, newpmd); + } + goto unlock; + } +#endif + /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and @@ -1875,7 +1917,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || + pmd_devmap(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -1993,14 +2036,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write, dirty, soft_dirty; + bool young, write, dirty, soft_dirty, pmd_migration = false; unsigned long addr; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) + && !pmd_devmap(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -2025,7 +2069,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } - page = pmd_page(*pmd); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + pmd_migration = is_pmd_migration_entry(*pmd); + if (pmd_migration) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(*pmd); + page = pfn_to_page(swp_offset(entry)); + } else +#endif + page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); @@ -2044,7 +2097,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * transferred to avoid any possibility of altering * permissions across VMAs. */ - if (freeze) { + if (freeze || pmd_migration) { swp_entry_t swp_entry; swp_entry = make_migration_entry(page + i, write); entry = swp_entry_to_pte(swp_entry); @@ -2143,7 +2196,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); if (PageMlocked(page)) clear_page_mlock(page); - } else if (!pmd_devmap(*pmd)) + } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6532b219b222..f1f3f5b41155 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4664,6 +4664,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, struct page *page = NULL; enum mc_target_type ret = MC_TARGET_NONE; + if (unlikely(is_swap_pmd(pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(pmd)); + return ret; + } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) diff --git a/mm/memory.c b/mm/memory.c index 13ee83b43878..886033b95fd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1065,7 +1065,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) + || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, @@ -1326,7 +1327,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); @@ -3911,6 +3912,13 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pmd_t orig_pmd = *vmf.pmd; barrier(); + if (unlikely(is_swap_pmd(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + if (is_pmd_migration_entry(orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); diff --git a/mm/mprotect.c b/mm/mprotect.c index bd0f409922cb..a1bfe9545770 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -149,7 +149,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) + if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) && pmd_none_or_clear_bad(pmd)) continue; @@ -159,7 +159,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mni_start, end); } - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { __split_huge_pmd(vma, pmd, addr, false, NULL); } else { diff --git a/mm/mremap.c b/mm/mremap.c index 7395564daa6c..cfec004c4ff9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -223,7 +223,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (pmd_trans_huge(*old_pmd)) { + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; /* See comment in move_ptes() */ -- cgit v1.2.3-70-g09d2 From df6ad69838fc9dcdbee0dcf2fc2c6f1113f8d609 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:24 -0700 Subject: mm/device-public-memory: device memory cache coherent with CPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Platform with advance system bus (like CAPI or CCIX) allow device memory to be accessible from CPU in a cache coherent fashion. Add a new type of ZONE_DEVICE to represent such memory. The use case are the same as for the un-addressable device memory but without all the corners cases. Link: http://lkml.kernel.org/r/20170817000548.32038-19-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Aneesh Kumar Cc: Paul E. McKenney Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: Ross Zwisler Cc: Balbir Singh Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- include/linux/hmm.h | 4 ++-- include/linux/ioport.h | 1 + include/linux/memremap.h | 21 ++++++++++++++++++ include/linux/mm.h | 20 ++++++++++------- kernel/memremap.c | 8 +++---- mm/Kconfig | 11 ++++++++++ mm/gup.c | 7 ++++++ mm/hmm.c | 4 ++-- mm/madvise.c | 2 +- mm/memcontrol.c | 12 +++++----- mm/memory.c | 46 +++++++++++++++++++++++++++++++++----- mm/migrate.c | 57 ++++++++++++++++++++++++++++++++---------------- mm/swap.c | 11 ++++++++++ 14 files changed, 159 insertions(+), 47 deletions(-) (limited to 'mm/gup.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 90ab657f8e56..281880c7e694 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1267,7 +1267,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; - page = vm_normal_page(vma, addr, pte); + page = _vm_normal_page(vma, addr, pte, true); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 67a03b20a2db..6d3b0b4fed4e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct hmm_devmem; struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, @@ -494,7 +494,7 @@ struct hmm_device { */ struct hmm_device *hmm_device_new(void *drvdata); void hmm_device_put(struct hmm_device *hmm_device); -#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ /* Below are for HMM internal use only! Not to be used by device driver! */ diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 3a4f69137bc2..f5cf32e80041 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -131,6 +131,7 @@ enum { IORES_DESC_PERSISTENT_MEMORY = 4, IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, + IORES_DESC_DEVICE_PUBLIC_MEMORY = 7, }; /* helpers to define resources */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8aa6b82679e2..f8ee1c73ad2d 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) * * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/vm/hmm.txt. + * + * MEMORY_DEVICE_PUBLIC: + * Device memory that is cache coherent from device and CPU point of view. This + * is use on platform that have an advance system bus (like CAPI or CCIX). A + * driver can hotplug the device memory using ZONE_DEVICE and with that memory + * type. Any page of a process can be migrated to such memory. However no one + * should be allow to pin such memory so that it can always be evicted. */ enum memory_type { MEMORY_DEVICE_HOST = 0, MEMORY_DEVICE_PRIVATE, + MEMORY_DEVICE_PUBLIC, }; /* @@ -92,6 +100,8 @@ enum memory_type { * The page_free() callback is called once the page refcount reaches 1 * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. * This allows the device driver to implement its own memory management.) + * + * For MEMORY_DEVICE_PUBLIC only the page_free() callback matter. */ typedef int (*dev_page_fault_t)(struct vm_area_struct *vma, unsigned long addr, @@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct page *page) return is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } + +static inline bool is_device_public_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PUBLIC; +} #else static inline void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, @@ -157,6 +173,11 @@ static inline bool is_device_private_page(const struct page *page) { return false; } + +static inline bool is_device_public_page(const struct page *page) +{ + return false; +} #endif /** diff --git a/include/linux/mm.h b/include/linux/mm.h index eccdab4bb44a..de66a1127db4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -800,15 +800,16 @@ static inline bool is_zone_device_page(const struct page *page) } #endif -#ifdef CONFIG_DEVICE_PRIVATE -void put_zone_device_private_page(struct page *page); +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page); #else -static inline void put_zone_device_private_page(struct page *page) +static inline void put_zone_device_private_or_public_page(struct page *page) { } -#endif +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ static inline bool is_device_private_page(const struct page *page); +static inline bool is_device_public_page(const struct page *page); DECLARE_STATIC_KEY_FALSE(device_private_key); @@ -834,8 +835,9 @@ static inline void put_page(struct page *page) * include/linux/memremap.h and HMM for details. */ if (static_branch_unlikely(&device_private_key) && - unlikely(is_device_private_page(page))) { - put_zone_device_private_page(page); + unlikely(is_device_private_page(page) || + is_device_public_page(page))) { + put_zone_device_private_or_public_page(page); return; } @@ -1224,8 +1226,10 @@ struct zap_details { pgoff_t last_index; /* Highest page->index to unmap */ }; -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte); +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device); +#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false) + struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); diff --git a/kernel/memremap.c b/kernel/memremap.c index ea0e18a2a5f2..6bcbfbf1a8fd 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -501,8 +501,8 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) #endif /* CONFIG_ZONE_DEVICE */ -#ifdef CONFIG_DEVICE_PRIVATE -void put_zone_device_private_page(struct page *page) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page) { int count = page_ref_dec_return(page); @@ -522,5 +522,5 @@ void put_zone_device_private_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL(put_zone_device_private_page); -#endif /* CONFIG_DEVICE_PRIVATE */ +EXPORT_SYMBOL(put_zone_device_private_or_public_page); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/Kconfig b/mm/Kconfig index ec27855db133..7bea16697d87 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -720,12 +720,23 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ARCH_HAS_HMM + select HMM help Allows creation of struct pages to represent unaddressable device memory; i.e., memory that is only accessible from the device (or group of devices). You likely also want to select HMM_MIRROR. +config DEVICE_PUBLIC + bool "Addressable device memory (like GPU memory)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent addressable device + memory; i.e., memory that is accessible from both the device and + the CPU + config FRAME_VECTOR bool diff --git a/mm/gup.c b/mm/gup.c index 76fd199aaae2..b2b4d4263768 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -456,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); + + /* + * This should never happen (a device public page in the gate + * area). + */ + if (is_device_public_page(*page)) + goto unmap; } get_page(*page); out: diff --git a/mm/hmm.c b/mm/hmm.c index c9d23ef80552..b31d56662202 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -737,7 +737,7 @@ EXPORT_SYMBOL(hmm_vma_fault); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, unsigned long addr) { @@ -1177,4 +1177,4 @@ static int __init hmm_init(void) } device_initcall(hmm_init); -#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/madvise.c b/mm/madvise.c index eea1c733286f..21261ff0466f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (!page) continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8aa98f9bc723..126a939b600a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4623,10 +4623,11 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE - * (so ZONE_DEVICE page and thus not on the lru). For now we such page is - * charge like a regular page would be as for all intent and purposes it is - * just special memory taking the place of a regular page. + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * For now we such page is charge like a regular page would be as for all + * intent and purposes it is just special memory taking the place of a + * regular page. * * See Documentations/vm/hmm.txt and include/linux/hmm.h * @@ -4657,7 +4658,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page)) + if (is_device_private_page(page) || + is_device_public_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; diff --git a/mm/memory.c b/mm/memory.c index 079eeac0b009..ad0ea1af1f44 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -818,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, #else # define HAVE_PTE_SPECIAL 0 #endif -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); @@ -830,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; - if (!is_zero_pfn(pfn)) - print_bad_pte(vma, addr, pte, NULL); + if (is_zero_pfn(pfn)) + return NULL; + + /* + * Device public pages are special pages (they are ZONE_DEVICE + * pages but different from persistent memory). They behave + * allmost like normal pages. The difference is that they are + * not on the lru and thus should never be involve with any- + * thing that involve lru manipulation (mlock, numa balancing, + * ...). + * + * This is why we still want to return NULL for such page from + * vm_normal_page() so that we do not have to special case all + * call site of vm_normal_page(). + */ + if (likely(pfn < highest_memmap_pfn)) { + struct page *page = pfn_to_page(pfn); + + if (is_device_public_page(page)) { + if (with_public_device) + return page; + return NULL; + } + } + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -1012,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; + } else if (pte_devmap(pte)) { + page = pte_page(pte); + + /* + * Cache coherent device memory behave like regular page and + * not like persistent memory page. For more informations see + * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h + */ + if (is_device_public_page(page)) { + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } } out_set_pte: @@ -1267,7 +1303,7 @@ again: if (pte_present(ptent)) { struct page *page; - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to diff --git a/mm/migrate.c b/mm/migrate.c index e581253ef330..618aeb5e9cde 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -239,10 +240,14 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - if (unlikely(is_zone_device_page(new)) && - is_device_private_page(new)) { - entry = make_device_private_entry(new, pte_write(pte)); - pte = swp_entry_to_pte(entry); + if (unlikely(is_zone_device_page(new))) { + if (is_device_private_page(new)) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + } else if (is_device_public_page(new)) { + pte = pte_mkdevmap(pte); + flush_dcache_page(new); + } } else flush_dcache_page(new); @@ -437,12 +442,11 @@ int migrate_page_move_mapping(struct address_space *mapping, void **pslot; /* - * ZONE_DEVICE pages have 1 refcount always held by their device - * - * Note that DAX memory will never reach that point as it does not have - * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h). + * Device public or private pages have an extra refcount as they are + * ZONE_DEVICE pages. */ - expected_count += is_zone_device_page(page); + expected_count += is_device_private_page(page); + expected_count += is_device_public_page(page); if (!mapping) { /* Anonymous page without mapping */ @@ -2123,7 +2127,6 @@ out_unlock: #endif /* CONFIG_NUMA */ - struct migrate_vma { struct vm_area_struct *vma; unsigned long *dst; @@ -2263,7 +2266,7 @@ again: pfn = 0; goto next; } - page = vm_normal_page(migrate->vma, addr, pte); + page = _vm_normal_page(migrate->vma, addr, pte, true); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } @@ -2406,10 +2409,19 @@ static bool migrate_vma_check_page(struct page *page) if (is_device_private_page(page)) return true; - /* Other ZONE_DEVICE memory type are not supported */ - return false; + /* + * Only allow device public page to be migrated and account for + * the extra reference count imply by ZONE_DEVICE pages. + */ + if (!is_device_public_page(page)) + return false; + extra++; } + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + if ((page_count(page) - extra) > page_mapcount(page)) return false; @@ -2647,11 +2659,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, */ __SetPageUptodate(page); - if (is_zone_device_page(page) && is_device_private_page(page)) { - swp_entry_t swp_entry; - - swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); - entry = swp_entry_to_pte(swp_entry); + if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + entry = swp_entry_to_pte(swp_entry); + } else if (is_device_public_page(page)) { + entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + entry = pte_mkdevmap(entry); + } } else { entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) @@ -2768,7 +2787,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - } else { + } else if (!is_device_public_page(newpage)) { /* * Other types of ZONE_DEVICE page are not * supported. diff --git a/mm/swap.c b/mm/swap.c index 62d96b8e5eb3..9295ae960d66 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -765,6 +765,17 @@ void release_pages(struct page **pages, int nr, bool cold) if (is_huge_zero_page(page)) continue; + /* Device public page can not be huge page */ + if (is_device_public_page(page)) { + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, + flags); + locked_pgdat = NULL; + } + put_zone_device_private_or_public_page(page); + continue; + } + page = compound_head(page); if (!put_page_testzero(page)) continue; -- cgit v1.2.3-70-g09d2