diff options
Diffstat (limited to 'mm/ksm.c')
-rw-r--r-- | mm/ksm.c | 89 |
1 files changed, 58 insertions, 31 deletions
@@ -39,6 +39,7 @@ #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" @@ -419,47 +420,74 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte; + int ret; + + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + return 0; + + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (pte_present(*pte)) { + page = vm_normal_page(walk->vma, addr, *pte); + } else if (!pte_none(*pte)) { + swp_entry_t entry = pte_to_swp_entry(*pte); + + /* + * As KSM pages remain KSM pages until freed, no need to wait + * here for migration to end. + */ + if (is_migration_entry(entry)) + page = pfn_swap_entry_to_page(entry); + } + ret = page && PageKsm(page); + pte_unmap_unlock(pte, ptl); + return ret; +} + +static const struct mm_walk_ops break_ksm_ops = { + .pmd_entry = break_ksm_pmd_entry, +}; + /* - * We use break_ksm to break COW on a ksm page: it's a stripped down - * - * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) - * put_page(page); + * We use break_ksm to break COW on a ksm page by triggering unsharing, + * such that the ksm page will get replaced by an exclusive anonymous page. * - * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * - * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context + * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { - struct page *page; vm_fault_t ret = 0; do { + int ksm_page; + cond_resched(); - page = follow_page(vma, addr, - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page)) - break; - if (PageKsm(page)) - ret = handle_mm_fault(vma, addr, - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, - NULL); - else - ret = VM_FAULT_WRITE; - put_page(page); - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); + ksm_page = walk_page_range_vma(vma, addr, addr + 1, + &break_ksm_ops, NULL); + if (WARN_ON_ONCE(ksm_page < 0)) + return ksm_page; + if (!ksm_page) + return 0; + ret = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, + NULL); + } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* - * We must loop because handle_mm_fault() may back out if there's - * any difficulty e.g. if pte accessed bit gets updated concurrently. - * - * VM_FAULT_WRITE is what we have been hoping for: it indicates that - * COW has been broken, even if the vma does not permit VM_WRITE; - * but note that a concurrent fault might break PageKsm for us. + * We must loop until we no longer find a KSM page because + * handle_mm_fault() may back out if there's any difficulty e.g. if + * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's @@ -1041,7 +1069,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, anon_exclusive = PageAnonExclusive(page); if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || - (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || anon_exclusive || mm_tlb_flush_pending(mm)) { pte_t entry; @@ -1079,11 +1106,11 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (pte_dirty(entry)) set_page_dirty(page); + entry = pte_mkclean(entry); + + if (pte_write(entry)) + entry = pte_wrprotect(entry); - if (pte_protnone(entry)) - entry = pte_mkclean(pte_clear_savedwrite(entry)); - else - entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = *pvmw.pte; @@ -3211,7 +3238,7 @@ static int __init ksm_init(void) #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ - hotplug_memory_notifier(ksm_memory_callback, 100); + hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; |