diff options
Diffstat (limited to 'mm')
54 files changed, 2908 insertions, 1387 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index beb7a455915d..46ef77d5c332 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP bool config ARCH_DISCARD_MEMBLOCK @@ -149,32 +149,6 @@ config NO_BOOTMEM config MEMORY_ISOLATION bool -config MOVABLE_NODE - bool "Enable to assign a node which has only movable memory" - depends on HAVE_MEMBLOCK - depends on NO_BOOTMEM - depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG - depends on NUMA - default n - help - Allow a node to have only movable memory. Pages used by the kernel, - such as direct mapping pages cannot be migrated. So the corresponding - memory device cannot be hotplugged. This option allows the following - two things: - - When the system is booting, node full of hotpluggable memory can - be arranged to have only movable memory so that the whole node can - be hot-removed. (need movable_node boot option specified). - - After the system is up, the option allows users to online all the - memory of a node as movable memory so that the whole node can be - hot-removed. - - Users who don't use the memory hotplug feature are fine with this - option on since they don't specify movable_node boot option or they - don't online memory as movable. - - Say Y here if you want to hotplug a whole node. - Say N here if you want kernel to use memory on all nodes evenly. - # # Only be set on architectures that have completely implemented memory hotplug # feature. If you are not sure, don't touch it. @@ -446,6 +420,18 @@ choice benefit. endchoice +config ARCH_WANTS_THP_SWAP + def_bool n + +config THP_SWAP + def_bool y + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP + help + Swap transparent huge pages in one piece, without splitting. + XXX: For now this only does clustered swap space allocation. + + For selection by architectures with reasonable THP sizes. 
+ config TRANSPARENT_HUGE_PAGECACHE def_bool y depends on TRANSPARENT_HUGEPAGE @@ -683,12 +669,16 @@ config IDLE_PAGE_TRACKING See Documentation/vm/idle_page_tracking.txt for more details. +# arch_add_memory() comprehends device memory +config ARCH_HAS_ZONE_DEVICE + bool + config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on X86_64 #arch_add_memory() comprehends device memory + depends on ARCH_HAS_ZONE_DEVICE help Device memory hotplug support allows for establishing pmem, @@ -706,3 +696,11 @@ config ARCH_USES_HIGH_VMA_FLAGS bool config ARCH_HAS_PKEYS bool + +config PERCPU_STATS + bool "Collect percpu memory statistics" + default n + help + This feature collects and exposes statistics via debugfs. The + information includes global and per chunk statistics, which can + be used to help understand percpu memory usage. diff --git a/mm/Makefile b/mm/Makefile index 026f6a828a50..411bd24d4a7c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -103,3 +103,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o +obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o diff --git a/mm/cleancache.c b/mm/cleancache.c index ba5d8f3e6d68..f7b9fdc79d97 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -130,7 +130,7 @@ void __cleancache_init_shared_fs(struct super_block *sb) int pool_id = CLEANCACHE_NO_BACKEND_SHARED; if (cleancache_ops) { - pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE); if (pool_id < 0) pool_id = CLEANCACHE_NO_POOL; } diff --git a/mm/compaction.c b/mm/compaction.c index 613c59e928cb..fb548e4c7bd4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,10 +236,9 @@ static void __reset_isolation_suitable(struct zone *zone) cond_resched(); - if 
(!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - - page = pfn_to_page(pfn); if (zone != page_zone(page)) continue; diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be573a5e6..2e906ef52143 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -376,6 +376,38 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); +/** + * filemap_range_has_page - check if a page exists in range. + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. + */ +bool filemap_range_has_page(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + pgoff_t index = start_byte >> PAGE_SHIFT; + pgoff_t end = end_byte >> PAGE_SHIFT; + struct pagevec pvec; + bool ret; + + if (end_byte < start_byte) + return false; + + if (mapping->nrpages == 0) + return false; + + pagevec_init(&pvec, 0); + if (!pagevec_lookup(&pvec, mapping, index, 1)) + return false; + ret = (pvec.pages[0]->index <= end); + pagevec_release(&pvec); + return ret; +} +EXPORT_SYMBOL(filemap_range_has_page); + static int __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { @@ -768,10 +800,10 @@ struct wait_page_key { struct wait_page_queue { struct page *page; int bit_nr; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_page_key *key = arg; struct wait_page_queue *wait_page @@ -834,7 +866,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, bool lock) { struct wait_page_queue wait_page; - wait_queue_t *wait = &wait_page.wait; + 
wait_queue_entry_t *wait = &wait_page.wait; int ret = 0; init_wait(wait); @@ -845,9 +877,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, for (;;) { spin_lock_irq(&q->lock); - if (likely(list_empty(&wait->task_list))) { + if (likely(list_empty(&wait->entry))) { if (lock) - __add_wait_queue_tail_exclusive(q, wait); + __add_wait_queue_entry_tail_exclusive(q, wait); else __add_wait_queue(q, wait); SetPageWaiters(page); @@ -907,7 +939,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) * * Add an arbitrary @waiter to the wait queue for the nominated @page. */ -void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) { wait_queue_head_t *q = page_waitqueue(page); unsigned long flags; @@ -2038,10 +2070,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t size; size = i_size_read(inode); - retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) + return -EAGAIN; + } else { + retval = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (retval < 0) + goto out; + } file_accessed(file); @@ -2226,7 +2265,7 @@ int filemap_fault(struct vm_fault *vmf) /* No page in the page cache at all */ do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); @@ -2642,6 +2681,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + if (limit != RLIM_INFINITY) { if (iocb->ki_pos >= limit) { 
send_sig(SIGXFSZ, current, 0); @@ -2710,9 +2752,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; - written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); - if (written) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* If there are pages to writeback, return */ + if (filemap_range_has_page(inode->i_mapping, pos, + pos + iov_iter_count(from))) + return -EAGAIN; + } else { + written = filemap_write_and_wait_range(mapping, pos, + pos + write_len - 1); + if (written) + goto out; + } /* * After a write we want buffered reads to be sure to go to disk to get @@ -208,72 +208,28 @@ no_page: return no_page_table(vma, flags); } -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in <linux/mm.h> - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). 
- */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +static struct page *follow_pmd_mask(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + unsigned int flags, unsigned int *page_mask) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; pmd_t *pmd; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - return page; - } - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return no_page_table(vma, flags); - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d)) - return no_page_table(vma, flags); - BUILD_BUG_ON(p4d_huge(*p4d)); - if (unlikely(p4d_bad(*p4d))) - return no_page_table(vma, flags); - pud = pud_offset(p4d, address); - if (pud_none(*pud)) + pmd = pmd_offset(pudp, address); + if (pmd_none(*pmd)) return no_page_table(vma, flags); - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pud(mm, address, pud, flags); + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } - if (pud_devmap(*pud)) { - ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); - spin_unlock(ptl); - if (page) - return page; - } - if (unlikely(pud_bad(*pud))) - return no_page_table(vma, flags); - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return no_page_table(vma, flags); - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags); + if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pmd_val(*pmd)), flags, + PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); @@ -319,13 +275,131 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return ret ? 
ERR_PTR(ret) : follow_page_pte(vma, address, pmd, flags); } - page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; return page; } + +static struct page *follow_pud_mask(struct vm_area_struct *vma, + unsigned long address, p4d_t *p4dp, + unsigned int flags, unsigned int *page_mask) +{ + pud_t *pud; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + pud = pud_offset(p4dp, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (is_hugepd(__hugepd(pud_val(*pud)))) { + page = follow_huge_pd(vma, address, + __hugepd(pud_val(*pud)), flags, + PUD_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags); + spin_unlock(ptl); + if (page) + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + return follow_pmd_mask(vma, address, pud, flags, page_mask); +} + + +static struct page *follow_p4d_mask(struct vm_area_struct *vma, + unsigned long address, pgd_t *pgdp, + unsigned int flags, unsigned int *page_mask) +{ + p4d_t *p4d; + struct page *page; + + p4d = p4d_offset(pgdp, address); + if (p4d_none(*p4d)) + return no_page_table(vma, flags); + BUILD_BUG_ON(p4d_huge(*p4d)); + if (unlikely(p4d_bad(*p4d))) + return no_page_table(vma, flags); + + if (is_hugepd(__hugepd(p4d_val(*p4d)))) { + page = follow_huge_pd(vma, address, + __hugepd(p4d_val(*p4d)), flags, + P4D_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + return follow_pud_mask(vma, address, p4d, flags, page_mask); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look 
up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + /* make this handle hugepd */ + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + return page; + } + + pgd = pgd_offset(mm, address); + + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + if (pgd_huge(*pgd)) { + page = follow_huge_pgd(mm, address, pgd, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (is_hugepd(__hugepd(pgd_val(*pgd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pgd_val(*pgd)), flags, + PGDIR_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + + return follow_p4d_mask(vma, address, pgd, flags, page_mask); +} + static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) @@ -387,11 +461,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) return -ENOENT; - /* For mm_populate(), just skip the stack guard page. 
*/ - if ((*flags & FOLL_POPULATE) && - (stack_guard_page_start(vma, address) || - stack_guard_page_end(vma, address + PAGE_SIZE))) - return -ENOENT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) @@ -407,12 +476,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, ret = handle_mm_fault(vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; - if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) - return -EFAULT; + int err = vm_fault_to_errno(ret, *flags); + + if (err) + return err; BUG(); } @@ -723,12 +790,10 @@ retry: ret = handle_mm_fault(vma, address, fault_flags); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return -EHWPOISON; - if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) - return -EFAULT; + int err = vm_fault_to_errno(ret, 0); + + if (err) + return err; BUG(); } @@ -1155,7 +1220,7 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ /* - * Generic RCU Fast GUP + * Generic Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -1176,8 +1241,8 @@ struct page *get_dump_page(unsigned long addr) * Before activating this code, please be aware that the following assumptions * are currently made: * - * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free - * pages containing page tables. + * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to + * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. 
* @@ -1187,7 +1252,7 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_RCU_GUP +#ifdef CONFIG_HAVE_GENERIC_GUP #ifndef gup_get_pte /* @@ -1358,16 +1423,15 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return __gup_device_huge_pmd(orig, addr, end, pages, nr); refs = 0; - head = pmd_page(orig); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pmd_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1397,16 +1461,15 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return __gup_device_huge_pud(orig, addr, end, pages, nr); refs = 0; - head = pud_page(orig); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pud_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1435,16 +1498,15 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, BUILD_BUG_ON(pgd_devmap(orig)); refs = 0; - head = pgd_page(orig); - page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); + page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pgd_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1677,4 +1739,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, return ret; } -#endif /* 
CONFIG_HAVE_GENERIC_RCU_GUP */ +#endif /* CONFIG_HAVE_GENERIC_GUP */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a84909cf20d3..86975dec0ba1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1426,8 +1426,11 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ if (unlikely(pmd_trans_migrating(*vmf->pmd))) { page = pmd_page(*vmf->pmd); + if (!get_page_unless_zero(page)) + goto out_unlock; spin_unlock(vmf->ptl); wait_on_page_locked(page); + put_page(page); goto out; } @@ -1459,9 +1462,12 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { + page_nid = -1; + if (!get_page_unless_zero(page)) + goto out_unlock; spin_unlock(vmf->ptl); wait_on_page_locked(page); - page_nid = -1; + put_page(page); goto out; } @@ -1569,8 +1575,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, get_page(page); spin_unlock(ptl); split_huge_page(page); - put_page(page); unlock_page(page); + put_page(page); goto out_unlocked; } @@ -2197,7 +2203,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * atomic_set() here would be safe on all archs (and not only on x86), * it's safer to use atomic_inc()/atomic_add(). 
*/ - if (PageAnon(head)) { + if (PageAnon(head) && !PageSwapCache(head)) { page_ref_inc(page_tail); } else { /* Additional pin to radix tree */ @@ -2208,6 +2214,7 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->flags |= (head->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | + (1L << PG_swapcache) | (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | @@ -2270,7 +2277,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - page_ref_inc(head); + /* Additional pin to radix tree of swap cache */ + if (PageSwapCache(head)) + page_ref_add(head, 2); + else + page_ref_inc(head); } else { /* Additional pin to radix tree */ page_ref_add(head, 2); @@ -2379,6 +2390,21 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) return ret; } +/* Racy check whether the huge page can be split */ +bool can_split_huge_page(struct page *page, int *pextra_pins) +{ + int extra_pins; + + /* Additional pins from radix tree */ + if (PageAnon(page)) + extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; + else + extra_pins = HPAGE_PMD_NR; + if (pextra_pins) + *pextra_pins = extra_pins; + return total_mapcount(page) == page_count(page) - extra_pins - 1; +} + /* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. 
@@ -2426,7 +2452,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } - extra_pins = 0; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2438,8 +2463,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - /* Addidional pins from radix tree */ - extra_pins = HPAGE_PMD_NR; anon_vma = NULL; i_mmap_lock_read(mapping); } @@ -2448,7 +2471,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * Racy check if we can split the page, before freeze_page() will * split PMDs */ - if (total_mapcount(head) != page_count(head) - extra_pins - 1) { + if (!can_split_huge_page(head, &extra_pins)) { ret = -EBUSY; goto out_unlock; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e5828875f7bb..1a88006ec634 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -867,7 +867,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; @@ -887,6 +887,22 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + int node; + + if (nid != NUMA_NO_NODE) + return dequeue_huge_page_node_exact(h, nid); + + for_each_online_node(node) { + page = dequeue_huge_page_node_exact(h, node); + if (page) + return page; + } + return NULL; +} + /* Movability of hugepages depends on migration support. 
*/ static inline gfp_t htlb_alloc_mask(struct hstate *h) { @@ -904,6 +920,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; + gfp_t gfp_mask; + int nid; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; @@ -924,12 +942,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask(h), &mpol, &nodemask); + gfp_mask = htlb_alloc_mask(h); + nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + zonelist = node_zonelist(nid, gfp_mask); for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { + if (cpuset_zone_allowed(zone, gfp_mask)) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { if (avoid_reserve) @@ -1024,9 +1043,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ - ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ - defined(CONFIG_CMA)) +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { @@ -1158,8 +1175,7 @@ static int alloc_fresh_gigantic_page(struct hstate *h, return 0; } -static inline bool gigantic_page_supported(void) { return true; } -#else +#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, @@ -1545,13 +1561,13 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, do { struct page *page; struct mempolicy *mpol; - struct zonelist *zl; + int nid; nodemask_t *nodemask; 
cpuset_mems_cookie = read_mems_allowed_begin(); - zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); + nid = huge_node(vma, addr, gfp, &mpol, &nodemask); mpol_cond_put(mpol); - page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + page = __alloc_pages_nodemask(gfp, order, nid, nodemask); if (page) return page; } while (read_mems_allowed_retry(cpuset_mems_cookie)); @@ -3185,17 +3201,17 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } -static int is_hugetlb_entry_migration(pte_t pte) +bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) - return 0; + return false; swp = pte_to_swp_entry(pte); if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; + return true; else - return 0; + return false; } static int is_hugetlb_entry_hwpoisoned(pte_t pte) @@ -3233,7 +3249,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr); + src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); @@ -3263,9 +3279,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ make_migration_entry_read(&swp_entry); entry = swp_entry_to_pte(swp_entry); - set_huge_pte_at(src, addr, src_pte, entry); + set_huge_swap_pte_at(src, addr, src_pte, + entry, sz); } - set_huge_pte_at(dst, addr, dst_pte, entry); + set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { huge_ptep_set_wrprotect(src, addr, src_pte); @@ -3317,7 +3334,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, sz); if (!ptep) continue; @@ -3338,7 
+3355,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { - huge_pte_clear(mm, address, ptep); + huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } @@ -3535,7 +3552,8 @@ retry_avoidcopy: unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -3574,7 +3592,8 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -3861,7 +3880,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, address &= huge_page_mask(h); - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -4118,7 +4137,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * * Note that page table lock is not held when pte is null. 
*/ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), + huge_page_size(h)); if (pte) ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -4170,6 +4190,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, } ret = hugetlb_fault(mm, vma, vaddr, fault_flags); if (ret & VM_FAULT_ERROR) { + int err = vm_fault_to_errno(ret, flags); + + if (err) + return err; + remainder = 0; break; } @@ -4252,7 +4277,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); @@ -4274,7 +4299,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, make_migration_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_huge_pte_at(mm, address, ptep, newpte); + set_huge_swap_pte_at(mm, address, ptep, + newpte, huge_page_size(h)); pages++; } spin_unlock(ptl); @@ -4516,7 +4542,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr); + spte = huge_pte_offset(svma->vm_mm, saddr, + vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; @@ -4612,7 +4639,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; @@ -4648,6 +4676,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, } struct page * __weak +follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, int flags, int pdshift) +{ + WARN(1, "hugepd follow 
called with no support for hugepage directory format\n"); + return NULL; +} + +struct page * __weak follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags) { @@ -4694,6 +4730,15 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } +struct page * __weak +follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) +{ + if (flags & FOLL_GET) + return NULL; + + return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); +} + #ifdef CONFIG_MEMORY_FAILURE /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 945fd1ca49b5..df4ebdb2b10a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -652,7 +652,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, spin_unlock(ptl); free_page_and_swap_cache(src_page); } - cond_resched(); } } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 20036d4f9f13..7780cd83a495 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -150,7 +150,7 @@ struct kmemleak_scan_area { */ struct kmemleak_object { spinlock_t lock; - unsigned long flags; /* object status flags */ + unsigned int flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; struct rb_node rb_node; @@ -159,6 +159,8 @@ struct kmemleak_object { atomic_t use_count; unsigned long pointer; size_t size; + /* pass surplus references to this pointer */ + unsigned long excess_ref; /* minimum number of a pointers found before it is considered leak */ int min_count; /* the total number of pointers found pointing to this object */ @@ -253,7 +255,8 @@ enum { KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN + KMEMLEAK_NO_SCAN, + KMEMLEAK_SET_EXCESS_REF }; /* @@ -262,9 +265,12 @@ enum { */ struct early_log { int op_type; /* kmemleak operation type */ - const void *ptr; /* allocated/freed memory block */ - size_t size; /* memory block size */ int min_count; /* minimum 
reference count */ + const void *ptr; /* allocated/freed memory block */ + union { + size_t size; /* memory block size */ + unsigned long excess_ref; /* surplus reference passing */ + }; unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -393,7 +399,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); - pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); @@ -562,6 +568,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object->flags = OBJECT_ALLOCATED; object->pointer = ptr; object->size = size; + object->excess_ref = 0; object->min_count = min_count; object->count = 0; /* white color initially */ object->jiffies = jiffies; @@ -795,6 +802,30 @@ out: } /* + * Any surplus references (object already gray) to 'ptr' are passed to + * 'excess_ref'. This is used in the vmalloc() case where a pointer to + * vm_struct may be used as an alternative reference to the vmalloc'ed object + * (see free_thread_stack()). + */ +static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n", + ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->excess_ref = excess_ref; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* * Set the OBJECT_NO_SCAN flag for the object corresponding to the give * pointer. Such object will not be scanned by kmemleak but references to it * are searched. 
@@ -908,7 +939,7 @@ static void early_alloc_percpu(struct early_log *log) * @gfp: kmalloc() flags used for kmemleak internal memory allocations * * This function is called from the kernel allocators when a new object - * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). + * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.). */ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) @@ -952,6 +983,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); /** + * kmemleak_vmalloc - register a newly vmalloc'ed object + * @area: pointer to vm_struct + * @size: size of the object + * @gfp: __vmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the vmalloc() kernel allocator when a new + * object (memory block) is allocated. + */ +void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + + /* + * A min_count = 2 is needed because vm_struct contains a reference to + * the virtual address of the vmalloc'ed block. + */ + if (kmemleak_enabled) { + create_object((unsigned long)area->addr, size, 2, gfp); + object_set_excess_ref((unsigned long)area, + (unsigned long)area->addr); + } else if (kmemleak_early_log) { + log_early(KMEMLEAK_ALLOC, area->addr, size, 2); + /* reusing early_log.size for storing area->addr */ + log_early(KMEMLEAK_SET_EXCESS_REF, + area, (unsigned long)area->addr, 0); + } +} +EXPORT_SYMBOL_GPL(kmemleak_vmalloc); + +/** * kmemleak_free - unregister a previously registered object * @ptr: pointer to beginning of the object * @@ -1188,6 +1249,30 @@ static bool update_checksum(struct kmemleak_object *object) } /* + * Update an object's references. object->lock must be held by the caller. 
+ */ +static void update_refs(struct kmemleak_object *object) +{ + if (!color_white(object)) { + /* non-orphan, ignored or new */ + return; + } + + /* + * Increase the object's reference count (number of pointers to the + * memory block). If this count reaches the required minimum, the + * object's color will become gray and it will be added to the + * gray_list. + */ + object->count++; + if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); + list_add_tail(&object->gray_list, &gray_list); + } +} + +/* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occurred. */ @@ -1224,6 +1309,7 @@ static void scan_block(void *_start, void *_end, for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; unsigned long pointer; + unsigned long excess_ref; if (scan_should_stop()) break; @@ -1259,25 +1345,27 @@ static void scan_block(void *_start, void *_end, * enclosed by scan_mutex. */ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - if (!color_white(object)) { - /* non-orphan, ignored or new */ - spin_unlock(&object->lock); - continue; - } - - /* - * Increase the object's reference count (number of pointers - * to the memory block). If this count reaches the required - * minimum, the object's color will become gray and it will be - * added to the gray_list. 
- */ - object->count++; + /* only pass surplus references (object already gray) */ if (color_gray(object)) { - /* put_object() called when removing from gray_list */ - WARN_ON(!get_object(object)); - list_add_tail(&object->gray_list, &gray_list); + excess_ref = object->excess_ref; + /* no need for update_refs() if object already gray */ + } else { + excess_ref = 0; + update_refs(object); } spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); + if (!object) + continue; + if (object == scanned) + /* circular reference, ignore */ + continue; + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); + spin_unlock(&object->lock); + } } read_unlock_irqrestore(&kmemleak_lock, flags); } @@ -1980,6 +2068,10 @@ void __init kmemleak_init(void) case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); break; + case KMEMLEAK_SET_EXCESS_REF: + object_set_excess_ref((unsigned long)log->ptr, + log->excess_ref); + break; default: kmemleak_warn("Unknown early log operation: %d\n", log->op_type); @@ -128,9 +128,12 @@ struct ksm_scan { * struct stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @chain_prune_time: time of the last full garbage collection + * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { @@ -138,11 +141,24 @@ struct stable_node { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; - struct list_head list; + struct { + struct 
hlist_node hlist_dup; + struct list_head list; + }; }; }; struct hlist_head hlist; - unsigned long kpfn; + union { + unsigned long kpfn; + unsigned long chain_prune_time; + }; + /* + * STABLE_NODE_CHAIN can be any negative number in + * rmap_hlist_len negative range, but better not -1 to be able + * to reliably detect underflows. + */ +#define STABLE_NODE_CHAIN -1024 + int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif @@ -192,6 +208,7 @@ static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); +#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -219,6 +236,18 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; +/* The number of stable_node chains */ +static unsigned long ksm_stable_node_chains; + +/* The number of stable_node dups linked to the stable_node chains */ +static unsigned long ksm_stable_node_dups; + +/* Delay in pruning stale stable_node_dups in the stable_node_chains */ +static int ksm_stable_node_chains_prune_millisecs = 2000; + +/* Maximum number of page slots sharing a stable node */ +static int ksm_max_page_sharing = 256; + /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -287,6 +316,45 @@ static void __init ksm_slab_free(void) mm_slot_cache = NULL; } +static __always_inline bool is_stable_node_chain(struct stable_node *chain) +{ + return chain->rmap_hlist_len == STABLE_NODE_CHAIN; +} + +static __always_inline bool is_stable_node_dup(struct stable_node *dup) +{ + return dup->head == STABLE_NODE_DUP_HEAD; +} + +static inline void stable_node_chain_add_dup(struct stable_node *dup, + struct stable_node *chain) +{ + VM_BUG_ON(is_stable_node_dup(dup)); + dup->head = 
STABLE_NODE_DUP_HEAD; + VM_BUG_ON(!is_stable_node_chain(chain)); + hlist_add_head(&dup->hlist_dup, &chain->hlist); + ksm_stable_node_dups++; +} + +static inline void __stable_node_dup_del(struct stable_node *dup) +{ + VM_BUG_ON(!is_stable_node_dup(dup)); + hlist_del(&dup->hlist_dup); + ksm_stable_node_dups--; +} + +static inline void stable_node_dup_del(struct stable_node *dup) +{ + VM_BUG_ON(is_stable_node_chain(dup)); + if (is_stable_node_dup(dup)) + __stable_node_dup_del(dup); + else + rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); +#ifdef CONFIG_DEBUG_VM + dup->head = NULL; +#endif +} + static inline struct rmap_item *alloc_rmap_item(void) { struct rmap_item *rmap_item; @@ -317,6 +385,8 @@ static inline struct stable_node *alloc_stable_node(void) static inline void free_stable_node(struct stable_node *stable_node) { + VM_BUG_ON(stable_node->rmap_hlist_len && + !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } @@ -498,25 +568,82 @@ static inline int get_kpfn_nid(unsigned long kpfn) return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } +static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, + struct rb_root *root) +{ + struct stable_node *chain = alloc_stable_node(); + VM_BUG_ON(is_stable_node_chain(dup)); + if (likely(chain)) { + INIT_HLIST_HEAD(&chain->hlist); + chain->chain_prune_time = jiffies; + chain->rmap_hlist_len = STABLE_NODE_CHAIN; +#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) + chain->nid = -1; /* debug */ +#endif + ksm_stable_node_chains++; + + /* + * Put the stable node chain in the first dimension of + * the stable tree and at the same time remove the old + * stable node. + */ + rb_replace_node(&dup->node, &chain->node, root); + + /* + * Move the old stable node to the second dimension + * queued in the hlist_dup. The invariant is that all + * dup stable_nodes in the chain->hlist point to pages + * that are wrprotected and have the exact same + * content. 
+ */ + stable_node_chain_add_dup(dup, chain); + } + return chain; +} + +static inline void free_stable_node_chain(struct stable_node *chain, + struct rb_root *root) +{ + rb_erase(&chain->node, root); + free_stable_node(chain); + ksm_stable_node_chains--; +} + static void remove_node_from_stable_tree(struct stable_node *stable_node) { struct rmap_item *rmap_item; + /* check it's not STABLE_NODE_CHAIN or negative */ + BUG_ON(stable_node->rmap_hlist_len < 0); + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } + /* + * We need the second aligned pointer of the migrate_nodes + * list_head to stay clear from the rb_parent_color union + * (aligned and different than any node) and also different + * from &migrate_nodes. This will verify that future list.h changes + * don't break STABLE_NODE_DUP_HEAD. 
+ */ +#if GCC_VERSION >= 40903 /* only recent gcc can handle it */ + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); +#endif + if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + stable_node_dup_del(stable_node); free_stable_node(stable_node); } @@ -635,6 +762,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; @@ -743,6 +872,31 @@ static int remove_stable_node(struct stable_node *stable_node) return err; } +static int remove_stable_node_chain(struct stable_node *stable_node, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + if (remove_stable_node(stable_node)) + return true; + else + return false; + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + if (remove_stable_node(dup)) + return true; + } + BUG_ON(!hlist_empty(&stable_node->hlist)); + free_stable_node_chain(stable_node, root); + return false; +} + static int remove_all_stable_nodes(void) { struct stable_node *stable_node, *next; @@ -753,7 +907,8 @@ static int remove_all_stable_nodes(void) while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct stable_node, node); - if (remove_stable_node(stable_node)) { + if (remove_stable_node_chain(stable_node, + root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } @@ -1028,8 +1183,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, goto out; if (PageTransCompound(page)) { - err = split_huge_page(page); - if 
(err) + if (split_huge_page(page)) goto out_unlock; } @@ -1139,6 +1293,214 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, return err ? NULL : page; } +static __always_inline +bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +{ + VM_BUG_ON(stable_node->rmap_hlist_len < 0); + /* + * Check that at least one mapping still exists, otherwise + * there's no much point to merge and share with this + * stable_node, as the underlying tree_page of the other + * sharer is going to be freed soon. + */ + return stable_node->rmap_hlist_len && + stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; +} + +static __always_inline +bool is_page_sharing_candidate(struct stable_node *stable_node) +{ + return __is_page_sharing_candidate(stable_node, 0); +} + +struct page *stable_node_dup(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; + struct hlist_node *hlist_safe; + struct page *_tree_page, *tree_page = NULL; + int nr = 0; + int found_rmap_hlist_len; + + if (!prune_stale_stable_nodes || + time_before(jiffies, stable_node->chain_prune_time + + msecs_to_jiffies( + ksm_stable_node_chains_prune_millisecs))) + prune_stale_stable_nodes = false; + else + stable_node->chain_prune_time = jiffies; + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + cond_resched(); + /* + * We must walk all stable_node_dup to prune the stale + * stable nodes during lookup. + * + * get_ksm_page can drop the nodes from the + * stable_node->hlist if they point to freed pages + * (that's why we do a _safe walk). The "dup" + * stable_node parameter itself will be freed from + * under us if it returns NULL. 
+ */ + _tree_page = get_ksm_page(dup, false); + if (!_tree_page) + continue; + nr += 1; + if (is_page_sharing_candidate(dup)) { + if (!found || + dup->rmap_hlist_len > found_rmap_hlist_len) { + if (found) + put_page(tree_page); + found = dup; + found_rmap_hlist_len = found->rmap_hlist_len; + tree_page = _tree_page; + + /* skip put_page for found dup */ + if (!prune_stale_stable_nodes) + break; + continue; + } + } + put_page(_tree_page); + } + + if (found) { + /* + * nr is counting all dups in the chain only if + * prune_stale_stable_nodes is true, otherwise we may + * break the loop at nr == 1 even if there are + * multiple entries. + */ + if (prune_stale_stable_nodes && nr == 1) { + /* + * If there's not just one entry it would + * corrupt memory, better BUG_ON. In KSM + * context with no lock held it's not even + * fatal. + */ + BUG_ON(stable_node->hlist.first->next); + + /* + * There's just one entry and it is below the + * deduplication limit so drop the chain. + */ + rb_replace_node(&stable_node->node, &found->node, + root); + free_stable_node(stable_node); + ksm_stable_node_chains--; + ksm_stable_node_dups--; + /* + * NOTE: the caller depends on the stable_node + * to be equal to stable_node_dup if the chain + * was collapsed. + */ + *_stable_node = found; + /* + * Just for robustneess as stable_node is + * otherwise left as a stable pointer, the + * compiler shall optimize it away at build + * time. + */ + stable_node = NULL; + } else if (stable_node->hlist.first != &found->hlist_dup && + __is_page_sharing_candidate(found, 1)) { + /* + * If the found stable_node dup can accept one + * more future merge (in addition to the one + * that is underway) and is not at the head of + * the chain, put it there so next search will + * be quicker in the !prune_stale_stable_nodes + * case. 
+ * + * NOTE: it would be inaccurate to use nr > 1 + * instead of checking the hlist.first pointer + * directly, because in the + * prune_stale_stable_nodes case "nr" isn't + * the position of the found dup in the chain, + * but the total number of dups in the chain. + */ + hlist_del(&found->hlist_dup); + hlist_add_head(&found->hlist_dup, + &stable_node->hlist); + } + } + + *_stable_node_dup = found; + return tree_page; +} + +static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, + struct rb_root *root) +{ + if (!is_stable_node_chain(stable_node)) + return stable_node; + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return NULL; + } + return hlist_entry(stable_node->hlist.first, + typeof(*stable_node), hlist_dup); +} + +/* + * Like for get_ksm_page, this function can free the *_stable_node and + * *_stable_node_dup if the returned tree_page is NULL. + * + * It can also free and overwrite *_stable_node with the found + * stable_node_dup if the chain is collapsed (in which case + * *_stable_node will be equal to *_stable_node_dup like if the chain + * never existed). It's up to the caller to verify tree_page is not + * NULL before dereferencing *_stable_node or *_stable_node_dup. + * + * *_stable_node_dup is really a second output parameter of this + * function and will be overwritten in all cases, the caller doesn't + * need to initialize it. + */ +static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct stable_node *stable_node = *_stable_node; + if (!is_stable_node_chain(stable_node)) { + if (is_page_sharing_candidate(stable_node)) { + *_stable_node_dup = stable_node; + return get_ksm_page(stable_node, false); + } + /* + * _stable_node_dup set to NULL means the stable_node + * reached the ksm_max_page_sharing limit. 
+ */ + *_stable_node_dup = NULL; + return NULL; + } + return stable_node_dup(_stable_node_dup, _stable_node, root, + prune_stale_stable_nodes); +} + +static __always_inline struct page *chain_prune(struct stable_node **s_n_d, + struct stable_node **s_n, + struct rb_root *root) +{ + return __stable_node_chain(s_n_d, s_n, root, true); +} + +static __always_inline struct page *chain(struct stable_node **s_n_d, + struct stable_node *s_n, + struct rb_root *root) +{ + struct stable_node *old_stable_node = s_n; + struct page *tree_page; + + tree_page = __stable_node_chain(s_n_d, &s_n, root, false); + /* not pruning dups so s_n cannot have changed */ + VM_BUG_ON(s_n != old_stable_node); + return tree_page; +} + /* * stable_tree_search - search for page inside the stable tree * @@ -1154,7 +1516,7 @@ static struct page *stable_tree_search(struct page *page) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; struct stable_node *page_node; page_node = page_stable_node(page); @@ -1176,7 +1538,44 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + tree_page = chain_prune(&stable_node_dup, &stable_node, root); + /* + * NOTE: stable_node may have been freed by + * chain_prune() if the returned stable_node_dup is + * not NULL. stable_node_dup may have been inserted in + * the rbtree instead as a regular stable_node (in + * order to collapse the stable_node chain if a single + * stable_node dup was found in it). In such case the + * stable_node is overwritten by the calleee to point + * to the stable_node_dup that was collapsed in the + * stable rbtree and stable_node will be equal to + * stable_node_dup like if the chain never existed. 
+ */ + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. + */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. + */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1199,6 +1598,34 @@ again: else if (ret > 0) new = &parent->rb_right; else { + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + /* + * Test if the migrated page should be merged + * into a stable node dup. If the mapcount is + * 1 we can migrate it with another KSM page + * without adding it to the chain. + */ + if (page_mapcount(page) > 1) + goto chain_append; + } + + if (!stable_node_dup) { + /* + * If the stable_node is a chain and + * we got a payload match in memcmp + * but we cannot merge the scanned + * page in any of the existing + * stable_node dups because they're + * all full, we need to wait the + * scanned page to find itself a match + * in the unstable tree to create a + * brand new KSM page to add later to + * the dups of this stable_node. + */ + return NULL; + } + /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page @@ -1206,23 +1633,21 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. 
*/ - tree_page = get_ksm_page(stable_node, true); - if (tree_page) { - unlock_page(tree_page); - if (get_kpfn_nid(stable_node->kpfn) != - NUMA(stable_node->nid)) { - put_page(tree_page); - goto replace; - } - return tree_page; - } - /* - * There is now a place for page_node, but the tree may - * have been rebalanced, so re-evaluate parent and new. - */ - if (page_node) + tree_page = get_ksm_page(stable_node_dup, true); + if (unlikely(!tree_page)) + /* + * The tree may have been rebalanced, + * so re-evaluate parent and new. + */ goto again; - return NULL; + unlock_page(tree_page); + + if (get_kpfn_nid(stable_node_dup->kpfn) != + NUMA(stable_node_dup->nid)) { + put_page(tree_page); + goto replace; + } + return tree_page; } } @@ -1233,22 +1658,95 @@ again: DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); - get_page(page); - return page; +out: + if (is_page_sharing_candidate(page_node)) { + get_page(page); + return page; + } else + return NULL; replace: - if (page_node) { - list_del(&page_node->list); - DO_NUMA(page_node->nid = nid); - rb_replace_node(&stable_node->node, &page_node->node, root); - get_page(page); + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. 
+ */ + if (stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); + /* there is no chain */ + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_replace_node(&stable_node_dup->node, + &page_node->node, + root); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + rb_erase(&stable_node_dup->node, root); + page = NULL; + } } else { - rb_erase(&stable_node->node, root); - page = NULL; + VM_BUG_ON(!is_stable_node_chain(stable_node)); + __stable_node_dup_del(stable_node_dup); + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + page = NULL; + } } - stable_node->head = &migrate_nodes; - list_add(&stable_node->list, stable_node->head); + stable_node_dup->head = &migrate_nodes; + list_add(&stable_node_dup->list, stable_node_dup->head); return page; + +chain_append: + /* stable_node_dup could be null if it reached the limit */ + if (!stable_node_dup) + stable_node_dup = stable_node_any; + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. 
+ */ + if (stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(stable_node_dup, + root); + if (!stable_node) + return NULL; + } + /* + * Add this stable_node dup that was + * migrated to the stable_node chain + * of the current nid for this page + * content. + */ + VM_BUG_ON(!is_stable_node_chain(stable_node)); + VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + goto out; } /* @@ -1265,7 +1763,8 @@ static struct stable_node *stable_tree_insert(struct page *kpage) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; + bool need_chain = false; kpfn = page_to_pfn(kpage); nid = get_kpfn_nid(kpfn); @@ -1280,7 +1779,32 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + tree_page = chain(&stable_node_dup, stable_node, root); + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. + */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. 
+ */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1303,27 +1827,37 @@ again: else if (ret > 0) new = &parent->rb_right; else { - /* - * It is not a bug that stable_tree_search() didn't - * find this node: because at that time our page was - * not yet write-protected, so may have changed since. - */ - return NULL; + need_chain = true; + break; } } - stable_node = alloc_stable_node(); - if (!stable_node) + stable_node_dup = alloc_stable_node(); + if (!stable_node_dup) return NULL; - INIT_HLIST_HEAD(&stable_node->hlist); - stable_node->kpfn = kpfn; - set_page_stable_node(kpage, stable_node); - DO_NUMA(stable_node->nid = nid); - rb_link_node(&stable_node->node, parent, new); - rb_insert_color(&stable_node->node, root); + INIT_HLIST_HEAD(&stable_node_dup->hlist); + stable_node_dup->kpfn = kpfn; + set_page_stable_node(kpage, stable_node_dup); + stable_node_dup->rmap_hlist_len = 0; + DO_NUMA(stable_node_dup->nid = nid); + if (!need_chain) { + rb_link_node(&stable_node_dup->node, parent, new); + rb_insert_color(&stable_node_dup->node, root); + } else { + if (!is_stable_node_chain(stable_node)) { + struct stable_node *orig = stable_node; + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(orig, root); + if (!stable_node) { + free_stable_node(stable_node_dup); + return NULL; + } + } + stable_node_chain_add_dup(stable_node_dup, stable_node); + } - return stable_node; + return stable_node_dup; } /* @@ -1413,8 +1947,27 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, * the same ksm page. */ static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node) + struct stable_node *stable_node, + bool max_page_sharing_bypass) { + /* + * rmap won't find this mapping if we don't insert the + * rmap_item in the right stable_node + * duplicate. 
page_migration could break later if rmap breaks, + * so we can as well crash here. We really need to check for + * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check + * for other negative values as an undeflow if detected here + * for the first time (and not when decreasing rmap_hlist_len) + * would be sign of memory corruption in the stable_node. + */ + BUG_ON(stable_node->rmap_hlist_len < 0); + + stable_node->rmap_hlist_len++; + if (!max_page_sharing_bypass) + /* possibly non fatal but unexpected overflow, only warn */ + WARN_ON_ONCE(stable_node->rmap_hlist_len > + ksm_max_page_sharing); + rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); @@ -1442,19 +1995,26 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) struct page *kpage; unsigned int checksum; int err; + bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && - get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != + NUMA(stable_node->nid)) { + stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; + /* + * If it's a KSM fork, allow it to go over the sharing limit + * without warnings. + */ + if (!is_page_sharing_candidate(stable_node)) + max_page_sharing_bypass = true; } /* We first start with searching the page inside the stable tree */ @@ -1474,7 +2034,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * add its rmap_item to the stable tree. 
*/ lock_page(kpage); - stable_tree_append(rmap_item, page_stable_node(kpage)); + stable_tree_append(rmap_item, page_stable_node(kpage), + max_page_sharing_bypass); unlock_page(kpage); } put_page(kpage); @@ -1524,8 +2085,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) lock_page(kpage); stable_node = stable_tree_insert(kpage); if (stable_node) { - stable_tree_append(tree_rmap_item, stable_node); - stable_tree_append(rmap_item, stable_node); + stable_tree_append(tree_rmap_item, stable_node, + false); + stable_tree_append(rmap_item, stable_node, + false); } unlock_page(kpage); @@ -2029,6 +2592,48 @@ static void wait_while_offlining(void) } } +static bool stable_node_dup_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn) +{ + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) { + /* + * Don't get_ksm_page, page has already gone: + * which is why we keep kpfn instead of page* + */ + remove_node_from_stable_tree(stable_node); + return true; + } + return false; +} + +static bool stable_node_chain_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + return stable_node_dup_remove_range(stable_node, start_pfn, + end_pfn); + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + stable_node_dup_remove_range(dup, start_pfn, end_pfn); + } + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return true; /* notify caller that tree was rebalanced */ + } else + return false; +} + static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { @@ -2040,15 +2645,12 @@ static void ksm_check_stable_tree(unsigned long start_pfn, node = 
rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct stable_node, node); - if (stable_node->kpfn >= start_pfn && - stable_node->kpfn < end_pfn) { - /* - * Don't get_ksm_page, page has already gone: - * which is why we keep kpfn instead of page* - */ - remove_node_from_stable_tree(stable_node); + if (stable_node_chain_remove_range(stable_node, + start_pfn, end_pfn, + root_stable_tree + + nid)) node = rb_first(root_stable_tree + nid); - } else + else node = rb_next(node); cond_resched(); } @@ -2294,6 +2896,47 @@ static ssize_t use_zero_pages_store(struct kobject *kobj, } KSM_ATTR(use_zero_pages); +static ssize_t max_page_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_max_page_sharing); +} + +static ssize_t max_page_sharing_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + int knob; + + err = kstrtoint(buf, 10, &knob); + if (err) + return err; + /* + * When a KSM page is created it is shared by 2 mappings. This + * being a signed comparison, it implicitly verifies it's not + * negative. + */ + if (knob < 2) + return -EINVAL; + + if (READ_ONCE(ksm_max_page_sharing) == knob) + return count; + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_max_page_sharing != knob) { + if (ksm_pages_shared || remove_all_stable_nodes()) + err = -EBUSY; + else + ksm_max_page_sharing = knob; + } + mutex_unlock(&ksm_thread_mutex); + + return err ? 
err : count; +} +KSM_ATTR(max_page_sharing); + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2332,6 +2975,46 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t stable_node_dups_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_dups); +} +KSM_ATTR_RO(stable_node_dups); + +static ssize_t stable_node_chains_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_chains); +} +KSM_ATTR_RO(stable_node_chains); + +static ssize_t +stable_node_chains_prune_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); +} + +static ssize_t +stable_node_chains_prune_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + ksm_stable_node_chains_prune_millisecs = msecs; + + return count; +} +KSM_ATTR(stable_node_chains_prune_millisecs); + static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2351,6 +3034,10 @@ static struct attribute *ksm_attrs[] = { #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif + &max_page_sharing_attr.attr, + &stable_node_chains_attr.attr, + &stable_node_dups_attr.attr, + &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, NULL, }; diff --git a/mm/memblock.c b/mm/memblock.c index b049c9b2dba8..2cb25fe4452c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,9 +54,6 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; -#ifdef CONFIG_MOVABLE_NODE -bool movable_node_enabled __initdata_memblock = false; -#endif static bool system_has_some_mirror 
__initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; @@ -1739,6 +1736,29 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } +extern unsigned long __init_memblock +memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) +{ + struct memblock_region *rgn; + unsigned long size = 0; + int idx; + + for_each_memblock_type((&memblock.reserved), rgn) { + phys_addr_t start, end; + + if (rgn->base + rgn->size < start_addr) + continue; + if (rgn->base > end_addr) + continue; + + start = rgn->base; + end = start + rgn->size; + size += end - start; + } + + return size; +} + void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94172089f52f..425aa0caa712 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -170,7 +170,7 @@ struct mem_cgroup_event { */ poll_table pt; wait_queue_head_t *wqh; - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct remove; }; @@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { struct mem_cgroup *memcg; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int memcg_oom_wake_function(wait_queue_t *wait, +static int memcg_oom_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; @@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; - INIT_LIST_HEAD(&owait.wait.task_list); + INIT_LIST_HEAD(&owait.wait.entry); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); mem_cgroup_mark_under_oom(memcg); @@ -2376,10 +2376,9 @@ void mem_cgroup_split_huge_fixup(struct page *head) #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) + int 
nr_entries) { - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); + this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries); } /** @@ -2405,8 +2404,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, false); - mem_cgroup_swap_statistics(to, true); + mem_cgroup_swap_statistics(from, -1); + mem_cgroup_swap_statistics(to, 1); return 0; } return -EINVAL; @@ -3574,6 +3573,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); + seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } @@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int memcg_event_wake(wait_queue_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct mem_cgroup_event *event = @@ -4122,6 +4122,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; + pn->lruvec_stat = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat) { + kfree(pn); + return 1; + } + lruvec_init(&pn->lruvec); pn->usage_in_excess = 0; pn->on_tree = false; @@ -4133,7 +4139,10 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->nodeinfo[node]); + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + + free_percpu(pn->lruvec_stat); + kfree(pn); } static void __mem_cgroup_free(struct mem_cgroup *memcg) @@ -5165,6 +5174,7 @@ static int memory_events_show(struct seq_file *m, void *v) seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); seq_printf(m, "max %lu\n", 
memcg_sum_events(memcg, MEMCG_MAX)); seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } @@ -5197,8 +5207,8 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "kernel_stack %llu\n", (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + - stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(stat[NR_SLAB_RECLAIMABLE] + + stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5222,15 +5232,25 @@ static int memory_stat_show(struct seq_file *m, void *v) } seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + + events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + + events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "workingset_refault %lu\n", stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", @@ -5445,7 +5465,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. 
*/ - mem_cgroup_uncharge_swap(entry); + mem_cgroup_uncharge_swap(entry, nr_pages); } } @@ -5873,9 +5893,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * ancestor for the swap instead and transfer the memory+swap charge. */ swap_memcg = mem_cgroup_id_get_online(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, true); + mem_cgroup_swap_statistics(swap_memcg, 1); page->mem_cgroup = NULL; @@ -5902,19 +5922,20 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) css_put(&memcg->css); } -/* - * mem_cgroup_try_charge_swap - try charging a swap entry +/** + * mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap entry to charge * - * Try to charge @entry to the memcg that @page belongs to. + * Try to charge @page's memcg for the swap space at @entry. * * Returns 0 on success, -ENOMEM on failure. 
*/ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + unsigned int nr_pages = hpage_nr_pages(page); struct page_counter *counter; + struct mem_cgroup *memcg; unsigned short oldid; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) @@ -5929,25 +5950,27 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && - !page_counter_try_charge(&memcg->swap, 1, &counter)) { + !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { mem_cgroup_id_put(memcg); return -ENOMEM; } - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* Get references for the tail pages, too */ + if (nr_pages > 1) + mem_cgroup_id_get_many(memcg, nr_pages - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(memcg, nr_pages); return 0; } /** - * mem_cgroup_uncharge_swap - uncharge a swap entry + * mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge - * - * Drop the swap charge associated with @entry. 
+ * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry) +void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; @@ -5955,18 +5978,18 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) if (!do_swap_account) return; - id = swap_cgroup_record(entry, 0); + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, 1); + page_counter_uncharge(&memcg->swap, nr_pages); else - page_counter_uncharge(&memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, nr_pages); } - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_id_put(memcg); + mem_cgroup_swap_statistics(memcg, -nr_pages); + mem_cgroup_id_put_many(memcg, nr_pages); } rcu_read_unlock(); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2527dfeddb00..a74c8311db95 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1184,7 +1184,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * page_remove_rmap() in try_to_unmap_one(). So to determine page status * correctly, we save a copy of the page flags at this time. 
*/ - page_flags = p->flags; + if (PageHuge(p)) + page_flags = hpage->flags; + else + page_flags = p->flags; /* * unpoison always clear PG_hwpoison inside page lock @@ -1489,11 +1492,16 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - nid); - else + if (PageHuge(p)) { + struct hstate *hstate = page_hstate(compound_head(p)); + + if (hstate_is_gigantic(hstate)) + return alloc_huge_page_node(hstate, NUMA_NO_NODE); + + return alloc_huge_page_node(hstate, nid); + } else { return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); + } } /* @@ -1595,12 +1603,8 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", pfn, ret, page->flags, &page->flags); - /* - * We know that soft_offline_huge_page() tries to migrate - * only one hugepage pointed to by hpage, so we need not - * run through the pagelist here. - */ - putback_active_hugepage(hpage); + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); if (ret > 0) ret = -EIO; } else { diff --git a/mm/memory.c b/mm/memory.c index 6ff5d729ded0..e31dd97e6114 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,7 +2719,7 @@ int do_swap_page(struct vm_fault *vmf) /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -2855,40 +2855,6 @@ out_release: } /* - * This is like a special single-page "expand_{down|up}wards()", - * except we must first make sure that 'address{-|+}PAGE_SIZE' - * doesn't hit another vma. 
- */ -static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) -{ - address &= PAGE_MASK; - if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { - struct vm_area_struct *prev = vma->vm_prev; - - /* - * Is there a mapping abutting this one below? - * - * That's only ok if it's the same stack mapping - * that has gotten split.. - */ - if (prev && prev->vm_end == address) - return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; - - return expand_downwards(vma, address - PAGE_SIZE); - } - if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { - struct vm_area_struct *next = vma->vm_next; - - /* As VM_GROWSDOWN but s/below/above/ */ - if (next && next->vm_start == address + PAGE_SIZE) - return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; - - return expand_upwards(vma, address + PAGE_SIZE); - } - return 0; -} - -/* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. @@ -2904,10 +2870,6 @@ static int do_anonymous_page(struct vm_fault *vmf) if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; - /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, vmf->address) < 0) - return VM_FAULT_SIGSEGV; - /* * Use pte_alloc() instead of pte_alloc_map(). We can't run * pte_offset_map() on pmds where a huge pmd might be created @@ -3029,6 +2991,17 @@ static int __do_fault(struct vm_fault *vmf) return ret; } +/* + * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. + * If we check pmd_trans_unstable() first we will trip the bad_pmd() check + * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly + * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. 
+ */ +static int pmd_devmap_trans_unstable(pmd_t *pmd) +{ + return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); +} + static int pte_alloc_one_map(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3052,18 +3025,27 @@ static int pte_alloc_one_map(struct vm_fault *vmf) map_pte: /* * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd - * didn't become pmd_trans_huge under us and then back to pmd_none, as - * a result of MADV_DONTNEED running immediately after a huge pmd fault - * in a different thread of this mm, in turn leading to a misleading - * pmd_trans_huge() retval. All we have to ensure is that it is a - * regular pmd that we can walk with pte_offset_map() and we can do that - * through an atomic read in C, which is what pmd_trans_unstable() - * provides. + * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of + * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge + * under us and then back to pmd_none, as a result of MADV_DONTNEED + * running immediately after a huge pmd fault in a different thread of + * this mm, in turn leading to a misleading pmd_trans_huge() retval. + * All we have to ensure is that it is a regular pmd that we can walk + * with pte_offset_map() and we can do that through an atomic read in + * C, which is what pmd_trans_unstable() provides. */ - if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) + if (pmd_devmap_trans_unstable(vmf->pmd)) return VM_FAULT_NOPAGE; + /* + * At this point we know that our vmf->pmd points to a page of ptes + * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() + * for the duration of the fault. If a racing MADV_DONTNEED runs and + * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still + * be valid and we will re-check to make sure the vmf->pte isn't + * pte_none() under vmf->ptl protection when we return to + * alloc_set_pte(). 
+ */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); return 0; @@ -3690,7 +3672,7 @@ static int handle_pte_fault(struct vm_fault *vmf) vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) + if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge @@ -3855,7 +3837,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); + count_memcg_event_mm(vma->vm_mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); @@ -4032,8 +4014,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); - if (!ptep) - goto out; if (!pte_present(*ptep)) goto unlock; *ptepp = ptep; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b63d7d1239df..f79aac7a12b5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -79,6 +79,8 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +bool movable_node_enabled = false; + #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; #else @@ -300,229 +302,38 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_zone_end_pfn; - - zone_span_writelock(zone); - - old_zone_end_pfn = zone_end_pfn(zone); - if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) - zone->zone_start_pfn = start_pfn; - - zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - - zone->zone_start_pfn; - - zone_span_writeunlock(zone); -} - 
-static void resize_zone(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - zone_span_writelock(zone); - - if (end_pfn - start_pfn) { - zone->zone_start_pfn = start_pfn; - zone->spanned_pages = end_pfn - start_pfn; - } else { - /* - * make it consist as free_area_init_core(), - * if spanned_pages = 0, then keep start_pfn = 0 - */ - zone->zone_start_pfn = 0; - zone->spanned_pages = 0; - } - - zone_span_writeunlock(zone); -} - -static void fix_zone_id(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - enum zone_type zid = zone_idx(zone); - int nid = zone->zone_pgdat->node_id; - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) - set_page_links(pfn_to_page(pfn), zid, nid, pfn); -} - -/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or - * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ -static int __ref ensure_zone_is_initialized(struct zone *zone, - unsigned long start_pfn, unsigned long num_pages) -{ - if (!zone_is_initialized(zone)) - return init_currently_empty_zone(zone, start_pfn, num_pages); - - return 0; -} - -static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) +static int __meminit __add_section(int nid, unsigned long phys_start_pfn, + bool want_memblock) { int ret; - unsigned long flags; - unsigned long z1_start_pfn; - - ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are higher than @z2 */ - if (end_pfn > zone_end_pfn(z2)) - goto out_fail; - /* the move out part must be at the left most of @z2 */ - if (start_pfn > z2->zone_start_pfn) - goto out_fail; - /* must included/overlap */ - if (end_pfn <= z2->zone_start_pfn) - goto out_fail; - - /* use start_pfn for z1's start_pfn if z1 is empty */ - if (!zone_is_empty(z1)) - z1_start_pfn = z1->zone_start_pfn; - else - 
z1_start_pfn = start_pfn; - - resize_zone(z1, z1_start_pfn, end_pfn); - resize_zone(z2, end_pfn, zone_end_pfn(z2)); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z1, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) -{ - int ret; - unsigned long flags; - unsigned long z2_end_pfn; - - ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are lower than @z1 */ - if (z1->zone_start_pfn > start_pfn) - goto out_fail; - /* the move out part mast at the right most of @z1 */ - if (zone_end_pfn(z1) > end_pfn) - goto out_fail; - /* must included/overlap */ - if (start_pfn >= zone_end_pfn(z1)) - goto out_fail; - - /* use end_pfn for z2's end_pfn if z2 is empty */ - if (!zone_is_empty(z2)) - z2_end_pfn = zone_end_pfn(z2); - else - z2_end_pfn = end_pfn; - - resize_zone(z1, z1->zone_start_pfn, start_pfn); - resize_zone(z2, start_pfn, z2_end_pfn); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z2, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static struct zone * __meminit move_pfn_range(int zone_shift, - unsigned long start_pfn, unsigned long end_pfn) -{ - struct zone *zone = page_zone(pfn_to_page(start_pfn)); - int ret = 0; - - if (zone_shift < 0) - ret = move_pfn_range_left(zone + zone_shift, zone, - start_pfn, end_pfn); - else if (zone_shift) - ret = move_pfn_range_right(zone, zone + zone_shift, - start_pfn, end_pfn); - - if (ret) - return NULL; - - return zone + zone_shift; -} - -static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); - - if 
(!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) - pgdat->node_start_pfn = start_pfn; - - pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - - pgdat->node_start_pfn; -} + int i; -static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; - int nid = pgdat->node_id; - int zone_type; - unsigned long flags, pfn; - int ret; + if (pfn_valid(phys_start_pfn)) + return -EEXIST; - zone_type = zone - pgdat->node_zones; - ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); - if (ret) + ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); + if (ret < 0) return ret; - pgdat_resize_lock(zone->zone_pgdat, &flags); - grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); - grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, - phys_start_pfn + nr_pages); - pgdat_resize_unlock(zone->zone_pgdat, &flags); - memmap_init_zone(nr_pages, nid, zone_type, - phys_start_pfn, MEMMAP_HOTPLUG); - - /* online_page_range is called later and expects pages reserved */ - for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { + /* + * Make all the pages reserved so that nobody will stumble over half + * initialized state. + * FIXME: We also have to associate it with a node because pfn_to_node + * relies on having page with the proper node. 
+ */ + for (i = 0; i < PAGES_PER_SECTION; i++) { + unsigned long pfn = phys_start_pfn + i; + struct page *page; if (!pfn_valid(pfn)) continue; - SetPageReserved(pfn_to_page(pfn)); + page = pfn_to_page(pfn); + set_page_node(page, nid); + SetPageReserved(page); } - return 0; -} - -static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn) -{ - int ret; - - if (pfn_valid(phys_start_pfn)) - return -EEXIST; - - ret = sparse_add_one_section(zone, phys_start_pfn); - - if (ret < 0) - return ret; - - ret = __add_zone(zone, phys_start_pfn); - if (ret < 0) - return ret; + if (!want_memblock) + return 0; return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } @@ -533,16 +344,14 @@ static int __meminit __add_section(int nid, struct zone *zone, * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) +int __ref __add_pages(int nid, unsigned long phys_start_pfn, + unsigned long nr_pages, bool want_memblock) { unsigned long i; int err = 0; int start_sec, end_sec; struct vmem_altmap *altmap; - clear_zone_contiguous(zone); - /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -562,7 +371,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i)); + err = __add_section(nid, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision @@ -575,7 +384,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } vmemmap_populate_print_last(); out: - set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -939,33 +747,20 @@ static int online_pages_range(unsigned long start_pfn, unsigned 
long nr_pages, unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; struct page *page; + if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); (*online_page_callback)(page); onlined_pages++; } + + online_mem_sections(start_pfn, start_pfn + nr_pages); + *(unsigned long *)arg = onlined_pages; return 0; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have - * normal memory. - */ -static bool can_online_high_movable(struct zone *zone) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure every online node has NORMAL memory */ -static bool can_online_high_movable(struct zone *zone) -{ - return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); -} -#endif /* CONFIG_MOVABLE_NODE */ - /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -1040,39 +835,131 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift) +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) { - struct zone *zone = page_zone(pfn_to_page(pfn)); - enum zone_type idx = zone_idx(zone); - int i; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); - *zone_shift = 0; + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. 
+ * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(default_zone) <= pfn; + } - if (idx < target) { - /* pages must be at end of current zone */ - if (pfn + nr_pages != zone_end_pfn(zone)) - return false; + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} + +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = zone_end_pfn(zone); + + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} + +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = pgdat_end_pfn(pgdat); + + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; +} + +void __ref move_pfn_range_to_zone(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + unsigned long flags; - /* no zones in use between current zone and target */ - for (i = idx + 1; i < target; i++) - if (zone_is_initialized(zone - idx + i)) - return false; + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + + clear_zone_contiguous(zone); + + /* TODO Huh pgdat is irqsave while zone is not. 
It used to be like that before */ + pgdat_resize_lock(pgdat, &flags); + zone_span_writelock(zone); + resize_zone_range(zone, start_pfn, nr_pages); + zone_span_writeunlock(zone); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + pgdat_resize_unlock(pgdat, &flags); + + /* + * TODO now we have a visible range of pages which are not associated + * with their zone properly. Not nice but set_pfnblock_flags_mask + * expects the zone spans the pfn range. All the pages in the range + * are reserved so nobody should be touching them so we should be safe + */ + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + + set_zone_contiguous(zone); +} + +/* + * Returns a default kernel memory zone for the given pfn range. + * If no kernel zone covers this pfn range it will automatically go + * to the ZONE_NORMAL. + */ +struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, + unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + int zid; + + for (zid = 0; zid <= ZONE_NORMAL; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (zone_intersects(zone, start_pfn, nr_pages)) + return zone; } - if (target < idx) { - /* pages must be at beginning of current zone */ - if (pfn != zone->zone_start_pfn) - return false; + return &pgdat->node_zones[ZONE_NORMAL]; +} - /* no zones in use between current zone and target */ - for (i = target + 1; i < idx; i++) - if (zone_is_initialized(zone - idx + i)) - return false; +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. 
+ */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); + + if (online_type == MMOP_ONLINE_KEEP) { + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + /* + * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use + * movable zone if that is not possible (e.g. we are within + * or past the existing movable zone) + */ + if (!allow_online_pfn_range(nid, start_pfn, nr_pages, + MMOP_ONLINE_KERNEL)) + zone = movable_zone; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + zone = &pgdat->node_zones[ZONE_MOVABLE]; } - *zone_shift = target - idx; - return true; + move_pfn_range_to_zone(zone, start_pfn, nr_pages); + return zone; } /* Must be protected by mem_hotplug_begin() */ @@ -1085,38 +972,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; - int zone_shift = 0; - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. 
- */ - zone = page_zone(pfn_to_page(pfn)); - - if ((zone_idx(zone) > ZONE_NORMAL || - online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(zone)) + nid = pfn_to_nid(pfn); + if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) { - if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) - return -EINVAL; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) - return -EINVAL; - } - - zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); - if (!zone) - return -EINVAL; + /* associate pfn range with the zone */ + zone = move_pfn_range(online_type, nid, pfn, nr_pages); arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = zone_to_nid(zone); - ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) @@ -1311,39 +1178,6 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } -/* - * If movable zone has already been setup, newly added memory should be check. - * If its address is higher than movable zone, it should be added as movable. - * Without this check, movable zone may overlap with other zone. 
- */ -static int should_add_memory_movable(int nid, u64 start, u64 size) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; - - if (zone_is_empty(movable_zone)) - return 0; - - if (movable_zone->zone_start_pfn <= start_pfn) - return 1; - - return 0; -} - -int zone_for_memory(int nid, u64 start, u64 size, int zone_default, - bool for_device) -{ -#ifdef CONFIG_ZONE_DEVICE - if (for_device) - return ZONE_DEVICE; -#endif - if (should_add_memory_movable(nid, start, size)) - return ZONE_MOVABLE; - - return zone_default; -} - static int online_memory_block(struct memory_block *mem, void *arg) { return device_online(&mem->dev); @@ -1389,7 +1223,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, false); + ret = arch_add_memory(nid, start, size, true); if (ret < 0) goto error; @@ -1398,7 +1232,22 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) node_set_online(nid); if (new_node) { - ret = register_one_node(nid); + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + ret = __register_one_node(nid); + if (ret) + goto register_fail; + + /* + * link memory sections under this node. This is already + * done when creating memory section in register_new_memory + * but that depends on the node being registered, so offline + * nodes have to go through register_node. + * TODO clean up this mess. + */ + ret = link_mem_sections(nid, start_pfn, nr_pages); +register_fail: /* * If sysfs file of new node can't create, cpu on the node * can't be hot-added. There is no rollback way now. 
@@ -1592,11 +1441,9 @@ static struct page *new_node_page(struct page *page, unsigned long private, gfp_mask |= __GFP_HIGHMEM; if (!nodes_empty(nmask)) - new_page = __alloc_pages_nodemask(gfp_mask, 0, - node_zonelist(nid, gfp_mask), &nmask); + new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask); if (!new_page) - new_page = __alloc_pages(gfp_mask, 0, - node_zonelist(nid, gfp_mask)); + new_page = __alloc_pages(gfp_mask, 0, nid); return new_page; } @@ -1725,47 +1572,12 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have - * normal memory. - */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure the node has NORMAL memory if it is still online */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long present_pages = 0; - enum zone_type zt; - - for (zt = 0; zt <= ZONE_NORMAL; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - if (present_pages > nr_pages) - return true; - - present_pages = 0; - for (; zt <= ZONE_MOVABLE; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - /* - * we can't offline the last normal memory until all - * higher memory is offlined. 
- */ - return present_pages == 0; -} -#endif /* CONFIG_MOVABLE_NODE */ - static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_MOVABLE_NODE +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; #else - pr_warn("movable_node option not supported\n"); + pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); #endif return 0; } @@ -1887,9 +1699,6 @@ static int __ref __offline_pages(unsigned long start_pfn, node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; - if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) - return -EINVAL; - /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 37d0b334bfe9..7d8e56214ac0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - /* - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. 
- * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } -/* - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; @@ -325,41 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - /* - * if step == 1, we use ->w.cpuset_mems_allowed to cache the - * result - */ - if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { - nodes_remap(tmp, pol->v.nodes, - pol->w.cpuset_mems_allowed, *nodes); - pol->w.cpuset_mems_allowed = step ? 
tmp : *nodes; - } else if (step == MPOL_REBIND_STEP2) { - tmp = pol->w.cpuset_mems_allowed; - pol->w.cpuset_mems_allowed = *nodes; - } else - BUG(); + nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = tmp; } if (nodes_empty(tmp)) tmp = *nodes; - if (step == MPOL_REBIND_STEP1) - nodes_or(pol->v.nodes, pol->v.nodes, tmp); - else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) - pol->v.nodes = tmp; - else - BUG(); - - if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node_in(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = numa_node_id(); - } + pol->v.nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes, - enum mpol_rebind_step step) + const nodemask_t *nodes) { nodemask_t tmp; @@ -385,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol, /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes + * Per-vma policies are protected by mmap_sem. Allocations using per-task + * policies are protected by task->mems_allowed_seq to prevent a premature + * OOM/allocation failure due to parallel nodemask modification. 
*/ -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, - enum mpol_rebind_step step) +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) - return; - - if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) - BUG(); - - if (step == MPOL_REBIND_STEP1) - pol->flags |= MPOL_F_REBINDING; - else if (step == MPOL_REBIND_STEP2) - pol->flags &= ~MPOL_F_REBINDING; - else if (step >= MPOL_REBIND_NSTEP) - BUG(); - - mpol_ops[pol->mode].rebind(pol, newmask, step); + mpol_ops[pol->mode].rebind(pol, newmask); } /* @@ -430,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - mpol_rebind_policy(tsk->mempolicy, new, step); + mpol_rebind_policy(tsk->mempolicy, new); } /* @@ -448,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); + mpol_rebind_policy(vma->vm_policy, new); up_write(&mm->mmap_sem); } @@ -812,9 +743,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE && - nodes_weight(new->v.nodes)) - current->il_next = first_node(new->v.nodes); + if (new && new->mode == MPOL_INTERLEAVE) + current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; @@ -916,7 +846,7 @@ static long do_get_mempolicy(int *policy, nodemask_t 
*nmask, *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { - *policy = current->il_next; + *policy = next_node_in(current->il_prev, pol->v.nodes); } else { err = -EINVAL; goto out; @@ -1676,9 +1606,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) return NULL; } -/* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, - int nd) +/* Return the node id preferred by the given mempolicy, or the given id */ +static int policy_node(gfp_t gfp, struct mempolicy *policy, + int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = policy->v.preferred_node; @@ -1691,20 +1621,19 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } - return node_zonelist(nd, gfp); + return nd; } /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { - unsigned nid, next; + unsigned next; struct task_struct *me = current; - nid = me->il_next; - next = next_node_in(nid, policy->v.nodes); + next = next_node_in(me->il_prev, policy->v.nodes); if (next < MAX_NUMNODES) - me->il_next = next; - return nid; + me->il_prev = next; + return next; } /* @@ -1799,38 +1728,37 @@ static inline unsigned interleave_nid(struct mempolicy *pol, #ifdef CONFIG_HUGETLBFS /* - * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * - * Returns a zonelist suitable for a huge page allocation and a pointer + * Returns a nid suitable for a huge page allocation and a pointer 
* to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags, struct mempolicy **mpol, - nodemask_t **nodemask) +int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask) { - struct zonelist *zl; + int nid; *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { - zl = node_zonelist(interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))), gfp_flags); + nid = interleave_nid(*mpol, vma, addr, + huge_page_shift(hstate_vma(vma))); } else { - zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); + nid = policy_node(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } - return zl; + return nid; } /* @@ -1932,12 +1860,10 @@ out: static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) { - struct zonelist *zl; struct page *page; - zl = node_zonelist(nid, gfp); - page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) + page = __alloc_pages(gfp, order, nid); + if (page && page_to_nid(page) == nid) inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1971,13 +1897,10 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, { struct mempolicy *pol; struct page *page; - unsigned int cpuset_mems_cookie; - struct zonelist *zl; + int preferred_nid; nodemask_t *nmask; -retry_cpuset: pol = get_vma_policy(vma, addr); - cpuset_mems_cookie = read_mems_allowed_begin(); if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; @@ -2015,12 +1938,10 @@ retry_cpuset: } nmask = policy_nodemask(gfp, pol); - zl = policy_zonelist(gfp, 
pol, node); - page = __alloc_pages_nodemask(gfp, order, zl, nmask); + preferred_nid = policy_node(gfp, pol, node); + page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; return page; } @@ -2038,23 +1959,15 @@ out: * Allocate a page from the kernel page pool. When not in * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. - * - * Don't call cpuset_update_task_memory_state() unless - * 1) it's ok to take cpuset_sem (can WAIT), and - * 2) allocating for current task (not interrupt). */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; - unsigned int cpuset_mems_cookie; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); - /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -2063,12 +1976,9 @@ retry_cpuset: page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, numa_node_id()), + policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - return page; } EXPORT_SYMBOL(alloc_pages_current); @@ -2112,10 +2022,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - if (new->flags & MPOL_F_REBINDING) - mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); - else - mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); + mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; diff --git a/mm/mempool.c b/mm/mempool.c index 47a659dedd44..1c0294858527 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -312,7 +312,7 @@ 
void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); diff --git a/mm/migrate.c b/mm/migrate.c index 89a0a1707f4c..051cc1555d36 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -227,25 +227,26 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); + flush_dcache_page(new); #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, vma, new, 0); - } -#endif - flush_dcache_page(new); - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - - if (PageHuge(new)) { + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); - } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, pvmw.address, false); - else - page_add_file_rmap(new, false); + } else +#endif + { + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + if (PageAnon(new)) + page_add_anon_rmap(new, vma, pvmw.address, false); + else + page_add_file_rmap(new, false); + } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); diff --git a/mm/mlock.c b/mm/mlock.c index c483c5c20b4b..b562b5523a65 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -284,7 +284,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { int i; int nr = pagevec_count(pvec); - int delta_munlocked; + int delta_munlocked = -nr; struct pagevec pvec_putback; int pgrescued = 0; @@ -304,6 +304,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) continue; else __munlock_isolation_failed(page); + } else { + delta_munlocked++; } /* @@ -315,7 +317,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - delta_munlocked = -nr 
+ pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(zone_lru_lock(zone)); diff --git a/mm/mmap.c b/mm/mmap.c index f82741e199c0..5a0ba9788cdd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -94,7 +94,7 @@ static void unmap_region(struct mm_struct *mm, * w: (no) no * x: (yes) yes */ -pgprot_t protection_map[16] = { +pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; @@ -183,6 +183,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) unsigned long retval; unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; + struct vm_area_struct *next; unsigned long min_brk; bool populate; LIST_HEAD(uf); @@ -229,7 +230,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) } /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + next = find_vma(mm, oldbrk); + if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; /* Ok, looks good - let it rip. */ @@ -253,10 +255,22 @@ out: static long vma_compute_subtree_gap(struct vm_area_struct *vma) { - unsigned long max, subtree_gap; - max = vma->vm_start; - if (vma->vm_prev) - max -= vma->vm_prev->vm_end; + unsigned long max, prev_end, subtree_gap; + + /* + * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we + * allow two stack_guard_gaps between them here, and when choosing + * an unmapped area; whereas when expanding we only require one. + * That's a little inconsistent, but keeps the code here simpler. 
+ */ + max = vm_start_gap(vma); + if (vma->vm_prev) { + prev_end = vm_end_gap(vma->vm_prev); + if (max > prev_end) + max -= prev_end; + else + max = 0; + } if (vma->vm_rb.rb_left) { subtree_gap = rb_entry(vma->vm_rb.rb_left, struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -352,7 +366,7 @@ static void validate_mm(struct mm_struct *mm) anon_vma_unlock_read(anon_vma); } - highest_address = vma->vm_end; + highest_address = vm_end_gap(vma); vma = vma->vm_next; i++; } @@ -541,7 +555,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_next) vma_gap_update(vma->vm_next); else - mm->highest_vm_end = vma->vm_end; + mm->highest_vm_end = vm_end_gap(vma); /* * vma->vm_prev wasn't known when we followed the rbtree to find the @@ -856,7 +870,7 @@ again: vma_gap_update(vma); if (end_changed) { if (!next) - mm->highest_vm_end = end; + mm->highest_vm_end = vm_end_gap(vma); else if (!adjust_next) vma_gap_update(next); } @@ -941,7 +955,7 @@ again: * mm->highest_vm_end doesn't need any update * in remove_next == 1 case. */ - VM_WARN_ON(mm->highest_vm_end != end); + VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); } } if (insert && file) @@ -1787,7 +1801,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info) while (true) { /* Visit left subtree if it looks promising */ - gap_end = vma->vm_start; + gap_end = vm_start_gap(vma); if (gap_end >= low_limit && vma->vm_rb.rb_left) { struct vm_area_struct *left = rb_entry(vma->vm_rb.rb_left, @@ -1798,12 +1812,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info) } } - gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + gap_start = vma->vm_prev ? 
vm_end_gap(vma->vm_prev) : 0; check_current: /* Check if current node has a suitable gap */ if (gap_start > high_limit) return -ENOMEM; - if (gap_end >= low_limit && gap_end - gap_start >= length) + if (gap_end >= low_limit && + gap_end > gap_start && gap_end - gap_start >= length) goto found; /* Visit right subtree if it looks promising */ @@ -1825,8 +1840,8 @@ check_current: vma = rb_entry(rb_parent(prev), struct vm_area_struct, vm_rb); if (prev == vma->vm_rb.rb_left) { - gap_start = vma->vm_prev->vm_end; - gap_end = vma->vm_start; + gap_start = vm_end_gap(vma->vm_prev); + gap_end = vm_start_gap(vma); goto check_current; } } @@ -1890,7 +1905,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) while (true) { /* Visit right subtree if it looks promising */ - gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; if (gap_start <= high_limit && vma->vm_rb.rb_right) { struct vm_area_struct *right = rb_entry(vma->vm_rb.rb_right, @@ -1903,10 +1918,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) check_current: /* Check if current node has a suitable gap */ - gap_end = vma->vm_start; + gap_end = vm_start_gap(vma); if (gap_end < low_limit) return -ENOMEM; - if (gap_start <= high_limit && gap_end - gap_start >= length) + if (gap_start <= high_limit && + gap_end > gap_start && gap_end - gap_start >= length) goto found; /* Visit left subtree if it looks promising */ @@ -1929,7 +1945,7 @@ check_current: struct vm_area_struct, vm_rb); if (prev == vma->vm_rb.rb_right) { gap_start = vma->vm_prev ? 
- vma->vm_prev->vm_end : 0; + vm_end_gap(vma->vm_prev) : 0; goto check_current; } } @@ -1967,7 +1983,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info; if (len > TASK_SIZE - mmap_min_addr) @@ -1978,9 +1994,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + vma = find_vma_prev(mm, addr, &prev); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) return addr; } @@ -2003,7 +2020,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, const unsigned long flags) { - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; unsigned long addr = addr0; struct vm_unmapped_area_info info; @@ -2018,9 +2035,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + vma = find_vma_prev(mm, addr, &prev); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) return addr; } @@ -2155,21 +2173,19 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, * update accounting. This is shared with both the * grow-up and grow-down cases. 
*/ -static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) +static int acct_stack_growth(struct vm_area_struct *vma, + unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; struct rlimit *rlim = current->signal->rlim; - unsigned long new_start, actual_size; + unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ - actual_size = size; - if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) - actual_size -= PAGE_SIZE; - if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) + if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) return -ENOMEM; /* mlock limit tests */ @@ -2207,16 +2223,32 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; + unsigned long gap_addr; int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* Guard against wrapping around to address 0. */ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else + /* Guard against exceeding limits of the address space. */ + address &= PAGE_MASK; + if (address >= TASK_SIZE) return -ENOMEM; + address += PAGE_SIZE; + + /* Enforce stack_guard_gap */ + gap_addr = address + stack_guard_gap; + + /* Guard against overflow */ + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + + next = vma->vm_next; + if (next && next->vm_start < gap_addr) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } /* We must make sure the anon_vma is allocated. 
*/ if (unlikely(anon_vma_prepare(vma))) @@ -2261,7 +2293,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_next) vma_gap_update(vma->vm_next); else - mm->highest_vm_end = address; + mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2282,6 +2314,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *prev; + unsigned long gap_addr; int error; address &= PAGE_MASK; @@ -2289,6 +2323,17 @@ int expand_downwards(struct vm_area_struct *vma, if (error) return error; + /* Enforce stack_guard_gap */ + gap_addr = address - stack_guard_gap; + if (gap_addr > address) + return -ENOMEM; + prev = vma->vm_prev; + if (prev && prev->vm_end > gap_addr) { + if (!(prev->vm_flags & VM_GROWSDOWN)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; @@ -2343,28 +2388,25 @@ int expand_downwards(struct vm_area_struct *vma, return error; } -/* - * Note how expand_stack() refuses to expand the stack all the way to - * abut the next virtual mapping, *unless* that mapping itself is also - * a stack mapping. We want to leave room for a guard page, after all - * (the guard page itself is not added here, that is done by the - * actual page faulting logic) - * - * This matches the behavior of the guard page logic (see mm/memory.c: - * check_stack_guard_page()), which only allows the guard page to be - * removed under these circumstances. - */ +/* enforced gap between the expanding stack and other mappings. 
*/ +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; + +static int __init cmdline_parse_stack_guard_gap(char *p) +{ + unsigned long val; + char *endptr; + + val = simple_strtoul(p, &endptr, 10); + if (!*endptr) + stack_guard_gap = val << PAGE_SHIFT; + + return 0; +} +__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + #ifdef CONFIG_STACK_GROWSUP int expand_stack(struct vm_area_struct *vma, unsigned long address) { - struct vm_area_struct *next; - - address &= PAGE_MASK; - next = vma->vm_next; - if (next && next->vm_start == address + PAGE_SIZE) { - if (!(next->vm_flags & VM_GROWSUP)) - return -ENOMEM; - } return expand_upwards(vma, address); } @@ -2386,14 +2428,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) #else int expand_stack(struct vm_area_struct *vma, unsigned long address) { - struct vm_area_struct *prev; - - address &= PAGE_MASK; - prev = vma->vm_prev; - if (prev && prev->vm_end == address) { - if (!(prev->vm_flags & VM_GROWSDOWN)) - return -ENOMEM; - } return expand_downwards(vma, address); } @@ -2491,7 +2525,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_prev = prev; vma_gap_update(vma); } else - mm->highest_vm_end = prev ? prev->vm_end : 0; + mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; /* Kill the cache */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 8edd0d576254..1a8c9ca83e48 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,8 +58,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * reading. 
*/ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (!pte) - return 0; /* Get target node for single threaded private VMAs */ if (prot_numa && !(vma->vm_flags & VM_SHARED) && diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 487dad610731..36454d0f96ee 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -118,7 +118,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start, unsigned long end_pfn = min_t(unsigned long, PFN_DOWN(end), max_low_pfn); - if (start_pfn > end_pfn) + if (start_pfn >= end_pfn) return 0; __free_pages_memory(start_pfn, end_pfn); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 04c9143a8625..0e2c925e7826 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -876,6 +876,11 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; mmgrab(mm); + + /* Raise event before sending signal: task reaper must see this */ + count_vm_event(OOM_KILL); + count_memcg_event_mm(mm, OOM_KILL); + /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user diff --git a/mm/page-writeback.c b/mm/page-writeback.c index db30ce0b7d80..0b60cc7ddac2 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2432,8 +2432,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inode_attach_wb(inode, page); wb = inode_to_wb(inode); - inc_memcg_page_state(page, NR_FILE_DIRTY); - __inc_node_page_state(page, NR_FILE_DIRTY); + __inc_lruvec_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2454,8 +2453,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); 
+ dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_SIZE); @@ -2711,8 +2709,7 @@ int clear_page_dirty_for_io(struct page *page) */ wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); + dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; @@ -2758,8 +2755,7 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - dec_memcg_page_state(page, NR_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK); + dec_lruvec_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } @@ -2813,8 +2809,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - inc_memcg_page_state(page, NR_WRITEBACK); - inc_node_page_state(page, NR_WRITEBACK); + inc_lruvec_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } unlock_page_memcg(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f9e450c6b6e4..bd65b60939b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -113,9 +113,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = { { [0] = 1UL } }, #endif -#ifdef CONFIG_MOVABLE_NODE [N_MEMORY] = { { [0] = 1UL } }, -#endif [N_CPU] = { { [0] = 1UL } }, #endif /* NUMA */ }; @@ -292,6 +290,26 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void reset_deferred_meminit(pg_data_t *pgdat) { + unsigned long max_initialise; + unsigned long reserved_lowmem; + + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB 
for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); + + /* + * Compensate for all the memblock reservations (e.g. crash kernel) + * from the initial estimation to make sure we will initialize enough + * memory to boot. + */ + reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, + pgdat->node_start_pfn + max_initialise); + max_initialise += reserved_lowmem; + + pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -314,20 +332,11 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - unsigned long max_initialise; - /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - (*nr_initialised)++; - if ((*nr_initialised > max_initialise) && + if ((*nr_initialised > pgdat->static_init_size) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -500,7 +509,7 @@ static int page_is_consistent(struct zone *zone, struct page *page) /* * Temporary debugging check for pages not lying within a given zone.
*/ -static int bad_range(struct zone *zone, struct page *page) +static int __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) return 1; @@ -510,7 +519,7 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } #else -static inline int bad_range(struct zone *zone, struct page *page) +static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) { return 0; } @@ -1286,8 +1295,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn) #endif #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { int nid; @@ -1309,8 +1319,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { return true; } -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { return true; } @@ -1354,7 +1365,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) return NULL; - start_page = pfn_to_page(start_pfn); + start_page = pfn_to_online_page(start_pfn); + if (!start_page) + return NULL; if (page_zone(start_page) != zone) return NULL; @@ -3662,6 +3675,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, return false; } +static inline bool +check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) +{ + /* + * It's possible that cpuset's mems_allowed and the nodemask from + * mempolicy don't intersect. 
This should be normally dealt with by + * policy_nodemask(), but it's possible to race with cpuset update in + * such a way the check therein was true, and then it became false + * before we got our cpuset_mems_cookie here. + * This assumes that for all allocations, ac->nodemask can come only + * from MPOL_BIND mempolicy (whose documented semantics is to be ignored + * when it does not intersect with the cpuset restrictions) or the + * caller can deal with a violated nodemask. + */ + if (cpusets_enabled() && ac->nodemask && + !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { + ac->nodemask = NULL; + return true; + } + + /* + * When updating a task's mems_allowed or mempolicy nodemask, it is + * possible to race with parallel threads in such a way that our + * allocation can fail while the mask is being updated. If we are about + * to fail, check if the cpuset changed during allocation and if so, + * retry. + */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + return true; + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -3857,11 +3903,9 @@ retry: &compaction_retries)) goto retry; - /* - * It's possible we raced with cpuset update so the OOM would be - * premature (see below the nopage: label for full explanation). 
- */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + + /* Deal with possible cpuset update races before we start OOM killing */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* Reclaim has failed us, start killing things */ @@ -3870,7 +3914,9 @@ retry: goto got_pg; /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE)) + if (test_thread_flag(TIF_MEMDIE) && + (alloc_flags == ALLOC_NO_WATERMARKS || + (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; /* Retry as long as the OOM killer is making progress */ @@ -3880,14 +3926,8 @@ retry: } nopage: - /* - * When updating a task's mems_allowed or mempolicy nodemask, it is - * possible to race with parallel threads in such a way that our - * allocation can fail while the mask is being updated. If we are about - * to fail, check if the cpuset changed during allocation and if so, - * retry. - */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + /* Deal with possible cpuset update races before we fail */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* @@ -3938,12 +3978,12 @@ got_pg: } static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask, + int preferred_nid, nodemask_t *nodemask, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { ac->high_zoneidx = gfp_zone(gfp_mask); - ac->zonelist = zonelist; + ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; ac->migratetype = gfpflags_to_migratetype(gfp_mask); @@ -3988,8 +4028,8 @@ static inline void finalise_ac(gfp_t gfp_mask, * This is the 'heart' of the zoned buddy allocator. 
*/ struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -3997,7 +4037,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct alloc_context ac = { }; gfp_mask &= gfp_allowed_mask; - if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; finalise_ac(gfp_mask, order, &ac); @@ -4601,8 +4641,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " slab_reclaimable:%lukB" - " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" " bounce:%lukB" @@ -4624,8 +4662,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), - K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), @@ -5111,6 +5147,7 @@ static void build_zonelists(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void setup_zone_pageset(struct zone *zone); /* @@ -5515,7 +5552,7 @@ static __meminit void zone_pcp_init(struct zone *zone) zone_batchsize(zone)); } -int __meminit init_currently_empty_zone(struct zone *zone, +void __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size) { @@ -5533,8 +5570,6 @@ int __meminit init_currently_empty_zone(struct 
zone *zone, zone_init_free_lists(zone); zone->initialized = 1; - - return 0; } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5992,7 +6027,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - int ret; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6014,6 +6048,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); + pgdat->per_cpu_nodestats = &boot_nodestats; + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; @@ -6074,8 +6110,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, size); - BUG_ON(ret); + init_currently_empty_zone(zone, zone_start_pfn, size); memmap_init(size, nid, j, zone_start_pfn); } } @@ -6136,7 +6171,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); - reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; pgdat->per_cpu_nodestats = NULL; @@ -6158,6 +6192,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif + reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } @@ -7169,6 +7204,21 @@ static unsigned long __init arch_reserved_kernel_pages(void) #endif /* + * Adaptive scale is meant to reduce sizes of hash tables on large memory + * machines. As memory size is increased the scale is also increased but at + * slower pace. 
Starting from ADAPT_SCALE_BASE (64G), every time memory + * quadruples the scale is increased by one, which means the size of hash table + * only doubles, instead of quadrupling as well. + * Because 32-bit systems cannot have large physical memory, where this scaling + * makes sense, it is disabled on such platforms. + */ +#if __BITS_PER_LONG > 32 +#define ADAPT_SCALE_BASE (64ul << 30) +#define ADAPT_SCALE_SHIFT 2 +#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) +#endif + +/* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 * quantity of entries @@ -7187,6 +7237,7 @@ void *__init alloc_large_system_hash(const char *tablename, unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; + gfp_t gfp_flags; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -7198,6 +7249,16 @@ void *__init alloc_large_system_hash(const char *tablename, if (PAGE_SHIFT < 20) numentries = round_up(numentries, (1<<20)/PAGE_SIZE); +#if __BITS_PER_LONG > 32 + if (!high_limit) { + unsigned long adapt; + + for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; + adapt <<= ADAPT_SCALE_SHIFT) + scale++; + } +#endif + /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) numentries >>= (scale - PAGE_SHIFT); @@ -7231,12 +7292,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); + /* + * memblock allocator returns zeroed memory already, so HASH_ZERO is + * currently not used when HASH_EARLY is specified. + */ + gfp_flags = (flags & HASH_ZERO) ? 
GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; if (flags & HASH_EARLY) table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) - table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags, PAGE_KERNEL); else { /* * If bucketsize is not a power-of-two, we may free @@ -7244,8 +7310,8 @@ void *__init alloc_large_system_hash(const char *tablename, * alloc_pages_exact() automatically does */ if (get_order(size) < MAX_ORDER) { - table = alloc_pages_exact(size, GFP_ATOMIC); - kmemleak_alloc(table, size, 1, GFP_ATOMIC); + table = alloc_pages_exact(size, gfp_flags); + kmemleak_alloc(table, size, 1, gfp_flags); } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -7647,6 +7713,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) break; if (pfn == end_pfn) return; + offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); pfn = start_pfn; diff --git a/mm/page_io.c b/mm/page_io.c index 23f6d0d3470f..2da71e627812 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); /* * We failed to write the page out to swap-space. 
@@ -118,7 +118,7 @@ static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); ClearPageUptodate(page); pr_alert("Read-error on swap-device (%u:%u:%llu)\n", diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5092e4ef00c8..3606104893e0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -138,12 +138,18 @@ static inline struct page * __first_valid_page(unsigned long pfn, unsigned long nr_pages) { int i; - for (i = 0; i < nr_pages; i++) - if (pfn_valid_within(pfn + i)) - break; - if (unlikely(i == nr_pages)) - return NULL; - return pfn_to_page(pfn + i); + + for (i = 0; i < nr_pages; i++) { + struct page *page; + + if (!pfn_valid_within(pfn + i)) + continue; + page = pfn_to_online_page(pfn + i); + if (!page) + continue; + return page; + } + return NULL; } /* @@ -184,8 +190,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, undo: for (pfn = start_pfn; pfn < undo_pfn; - pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn), migratetype); + pfn += pageblock_nr_pages) { + struct page *page = pfn_to_online_page(pfn); + if (!page) + continue; + unset_migratetype_isolate(page, migratetype); + } return -EBUSY; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index de9c40d7304a..8ec6ba230bb9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address); + pvmw->pte = huge_pte_offset(mm, pvmw->address, + PAGE_SIZE << compound_order(page)); if (!pvmw->pte) return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 60f7856e508f..1a4197965415 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct 
hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); + unsigned long sz = huge_page_size(h); pte_t *pte; int err = 0; do { next = hugetlb_entry_end(h, addr, end); - pte = huge_pte_offset(walk->mm, addr & hmask); + pte = huge_pte_offset(walk->mm, addr & hmask, sz); if (pte && walk->hugetlb_entry) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); if (err) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h new file mode 100644 index 000000000000..cd2442e13d8f --- /dev/null +++ b/mm/percpu-internal.h @@ -0,0 +1,166 @@ +#ifndef _MM_PERCPU_INTERNAL_H +#define _MM_PERCPU_INTERNAL_H + +#include <linux/types.h> +#include <linux/percpu.h> + +struct pcpu_chunk { +#ifdef CONFIG_PERCPU_STATS + int nr_alloc; /* # of allocations */ + size_t max_alloc_size; /* largest allocation size */ +#endif + + struct list_head list; /* linked to pcpu_slot lists */ + int free_size; /* free bytes in the chunk */ + int contig_hint; /* max contiguous size hint */ + void *base_addr; /* base address of this chunk */ + + int map_used; /* # of map entries used before the sentry */ + int map_alloc; /* # of map entries allocated */ + int *map; /* allocation map */ + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ + + void *data; /* chunk data */ + int first_free; /* no free below this */ + bool immutable; /* no [de]population allowed */ + bool has_reserved; /* Indicates if chunk has reserved space + at the beginning. Reserved chunk will + contain reservation for static chunk. + Dynamic chunk will contain reservation + for static and reserved chunks. 
*/ + int nr_populated; /* # of populated pages */ + unsigned long populated[]; /* populated bitmap */ +}; + +extern spinlock_t pcpu_lock; + +extern struct list_head *pcpu_slot; +extern int pcpu_nr_slots; + +extern struct pcpu_chunk *pcpu_first_chunk; +extern struct pcpu_chunk *pcpu_reserved_chunk; + +#ifdef CONFIG_PERCPU_STATS + +#include <linux/spinlock.h> + +struct percpu_stats { + u64 nr_alloc; /* lifetime # of allocations */ + u64 nr_dealloc; /* lifetime # of deallocations */ + u64 nr_cur_alloc; /* current # of allocations */ + u64 nr_max_alloc; /* max # of live allocations */ + u32 nr_chunks; /* current # of live chunks */ + u32 nr_max_chunks; /* max # of live chunks */ + size_t min_alloc_size; /* min allocation size */ + size_t max_alloc_size; /* max allocation size */ +}; + +extern struct percpu_stats pcpu_stats; +extern struct pcpu_alloc_info pcpu_stats_ai; + +/* + * For debug purposes. We don't care about the flexible array. + */ +static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) +{ + memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); + + /* initialize min_alloc_size to unit_size */ + pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; +} + +/* + * pcpu_stats_area_alloc - increment area allocation stats + * @chunk: the location of the area being allocated + * @size: size of area to allocate in bytes + * + * CONTEXT: + * pcpu_lock.
+ */ +static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) +{ + lockdep_assert_held(&pcpu_lock); + + pcpu_stats.nr_alloc++; + pcpu_stats.nr_cur_alloc++; + pcpu_stats.nr_max_alloc = + max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); + pcpu_stats.min_alloc_size = + min(pcpu_stats.min_alloc_size, size); + pcpu_stats.max_alloc_size = + max(pcpu_stats.max_alloc_size, size); + + chunk->nr_alloc++; + chunk->max_alloc_size = max(chunk->max_alloc_size, size); +} + +/* + * pcpu_stats_area_dealloc - decrement allocation stats + * @chunk: the location of the area being deallocated + * + * CONTEXT: + * pcpu_lock. + */ +static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) +{ + lockdep_assert_held(&pcpu_lock); + + pcpu_stats.nr_dealloc++; + pcpu_stats.nr_cur_alloc--; + + chunk->nr_alloc--; +} + +/* + * pcpu_stats_chunk_alloc - increment chunk stats + */ +static inline void pcpu_stats_chunk_alloc(void) +{ + unsigned long flags; + spin_lock_irqsave(&pcpu_lock, flags); + + pcpu_stats.nr_chunks++; + pcpu_stats.nr_max_chunks = + max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); + + spin_unlock_irqrestore(&pcpu_lock, flags); +} + +/* + * pcpu_stats_chunk_dealloc - decrement chunk stats + */ +static inline void pcpu_stats_chunk_dealloc(void) +{ + unsigned long flags; + spin_lock_irqsave(&pcpu_lock, flags); + + pcpu_stats.nr_chunks--; + + spin_unlock_irqrestore(&pcpu_lock, flags); +} + +#else + +static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) +{ +} + +static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) +{ +} + +static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) +{ +} + +static inline void pcpu_stats_chunk_alloc(void) +{ +} + +static inline void pcpu_stats_chunk_dealloc(void) +{ +} + +#endif /* !CONFIG_PERCPU_STATS */ + +#endif diff --git a/mm/percpu-km.c b/mm/percpu-km.c index d66911ff42d9..eb58aa4c0997 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -72,6 +72,9 
@@ static struct pcpu_chunk *pcpu_create_chunk(void) pcpu_chunk_populated(chunk, 0, nr_pages); spin_unlock_irq(&pcpu_lock); + pcpu_stats_chunk_alloc(); + trace_percpu_create_chunk(chunk->base_addr); + return chunk; } @@ -79,7 +82,13 @@ static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) { const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; - if (chunk && chunk->data) + if (!chunk) + return; + + pcpu_stats_chunk_dealloc(); + trace_percpu_destroy_chunk(chunk->base_addr); + + if (chunk->data) __free_pages(chunk->data, order_base_2(nr_pages)); pcpu_free_chunk(chunk); } diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c new file mode 100644 index 000000000000..03524a56eeff --- /dev/null +++ b/mm/percpu-stats.c @@ -0,0 +1,222 @@ +/* + * mm/percpu-debug.c + * + * Copyright (C) 2017 Facebook Inc. + * Copyright (C) 2017 Dennis Zhou <dennisz@fb.com> + * + * This file is released under the GPLv2. + * + * Prints statistics about the percpu allocator and backing chunks. + */ +#include <linux/debugfs.h> +#include <linux/list.h> +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/sort.h> +#include <linux/vmalloc.h> + +#include "percpu-internal.h" + +#define P(X, Y) \ + seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y) + +struct percpu_stats pcpu_stats; +struct pcpu_alloc_info pcpu_stats_ai; + +static int cmpint(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +/* + * Iterates over all chunks to find the max # of map entries used. + */ +static int find_max_map_used(void) +{ + struct pcpu_chunk *chunk; + int slot, max_map_used; + + max_map_used = 0; + for (slot = 0; slot < pcpu_nr_slots; slot++) + list_for_each_entry(chunk, &pcpu_slot[slot], list) + max_map_used = max(max_map_used, chunk->map_used); + + return max_map_used; +} + +/* + * Prints out chunk state. Fragmentation is considered between + * the beginning of the chunk to the last allocation. 
+ */ +static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, + void *buffer) +{ + int i, s_index, last_alloc, alloc_sign, as_len; + int *alloc_sizes, *p; + /* statistics */ + int sum_frag = 0, max_frag = 0; + int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; + + alloc_sizes = buffer; + s_index = chunk->has_reserved ? 1 : 0; + + /* find last allocation */ + last_alloc = -1; + for (i = chunk->map_used - 1; i >= s_index; i--) { + if (chunk->map[i] & 1) { + last_alloc = i; + break; + } + } + + /* if the chunk is not empty - ignoring reserve */ + if (last_alloc >= s_index) { + as_len = last_alloc + 1 - s_index; + + /* + * Iterate through chunk map computing size info. + * The first bit is overloaded to be a used flag. + * negative = free space, positive = allocated + */ + for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) { + alloc_sign = (*p & 1) ? 1 : -1; + alloc_sizes[i] = alloc_sign * + ((p[1] & ~1) - (p[0] & ~1)); + } + + sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL); + + /* Iterate through the unallocated fragments.
*/ + for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) { + sum_frag -= *p; + max_frag = max(max_frag, -1 * (*p)); + } + + cur_min_alloc = alloc_sizes[i]; + cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2]; + cur_max_alloc = alloc_sizes[as_len - 1]; + } + + P("nr_alloc", chunk->nr_alloc); + P("max_alloc_size", chunk->max_alloc_size); + P("free_size", chunk->free_size); + P("contig_hint", chunk->contig_hint); + P("sum_frag", sum_frag); + P("max_frag", max_frag); + P("cur_min_alloc", cur_min_alloc); + P("cur_med_alloc", cur_med_alloc); + P("cur_max_alloc", cur_max_alloc); + seq_putc(m, '\n'); +} + +static int percpu_stats_show(struct seq_file *m, void *v) +{ + struct pcpu_chunk *chunk; + int slot, max_map_used; + void *buffer; + +alloc_buffer: + spin_lock_irq(&pcpu_lock); + max_map_used = find_max_map_used(); + spin_unlock_irq(&pcpu_lock); + + buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0])); + if (!buffer) + return -ENOMEM; + + spin_lock_irq(&pcpu_lock); + + /* if the buffer allocated earlier is too small */ + if (max_map_used < find_max_map_used()) { + spin_unlock_irq(&pcpu_lock); + vfree(buffer); + goto alloc_buffer; + } + +#define PL(X) \ + seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X) + + seq_printf(m, + "Percpu Memory Statistics\n" + "Allocation Info:\n" + "----------------------------------------\n"); + PL(unit_size); + PL(static_size); + PL(reserved_size); + PL(dyn_size); + PL(atom_size); + PL(alloc_size); + seq_putc(m, '\n'); + +#undef PL + +#define PU(X) \ + seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X) + + seq_printf(m, + "Global Stats:\n" + "----------------------------------------\n"); + PU(nr_alloc); + PU(nr_dealloc); + PU(nr_cur_alloc); + PU(nr_max_alloc); + PU(nr_chunks); + PU(nr_max_chunks); + PU(min_alloc_size); + PU(max_alloc_size); + seq_putc(m, '\n'); + +#undef PU + + seq_printf(m, + "Per Chunk Stats:\n" + "----------------------------------------\n"); + + if 
(pcpu_reserved_chunk) { + seq_puts(m, "Chunk: <- Reserved Chunk\n"); + chunk_map_stats(m, pcpu_reserved_chunk, buffer); + } + + for (slot = 0; slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + if (chunk == pcpu_first_chunk) { + seq_puts(m, "Chunk: <- First Chunk\n"); + chunk_map_stats(m, chunk, buffer); + + + } else { + seq_puts(m, "Chunk:\n"); + chunk_map_stats(m, chunk, buffer); + } + + } + } + + spin_unlock_irq(&pcpu_lock); + + vfree(buffer); + + return 0; +} + +static int percpu_stats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, percpu_stats_show, NULL); +} + +static const struct file_operations percpu_stats_fops = { + .open = percpu_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_percpu_stats_debugfs(void) +{ + debugfs_create_file("percpu_stats", 0444, NULL, NULL, + &percpu_stats_fops); + + return 0; +} + +late_initcall(init_percpu_stats_debugfs); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 9ac639499bd1..15dab691ea70 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -343,12 +343,22 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = vms; chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; + + pcpu_stats_chunk_alloc(); + trace_percpu_create_chunk(chunk->base_addr); + return chunk; } static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) { - if (chunk && chunk->data) + if (!chunk) + return; + + pcpu_stats_chunk_dealloc(); + trace_percpu_destroy_chunk(chunk->base_addr); + + if (chunk->data) pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); pcpu_free_chunk(chunk); } diff --git a/mm/percpu.c b/mm/percpu.c index e0aa8ae7bde7..bd4130a69bbc 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,11 @@ #include <asm/tlbflush.h> #include <asm/io.h> +#define CREATE_TRACE_POINTS +#include <trace/events/percpu.h> + +#include "percpu-internal.h" + #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 @@ -103,53 +108,35 @@ #define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) #endif /* CONFIG_SMP */ -struct pcpu_chunk { - struct list_head list; /* linked to pcpu_slot lists */ - int free_size; /* free bytes in the chunk */ - int contig_hint; /* max contiguous size hint */ - void *base_addr; /* base address of this chunk */ - - int map_used; /* # of map entries used before the sentry */ - int map_alloc; /* # of map entries allocated */ - int *map; /* allocation map */ - struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ - - void *data; /* chunk data */ - int first_free; /* no free below this */ - bool immutable; /* no [de]population allowed */ - int nr_populated; /* # of populated pages */ - unsigned long populated[]; /* populated bitmap */ -}; - -static int pcpu_unit_pages __read_mostly; -static int pcpu_unit_size __read_mostly; -static int pcpu_nr_units __read_mostly; -static int pcpu_atom_size __read_mostly; -static int pcpu_nr_slots __read_mostly; -static size_t pcpu_chunk_struct_size __read_mostly; +static int pcpu_unit_pages __ro_after_init; +static int pcpu_unit_size __ro_after_init; +static int pcpu_nr_units __ro_after_init; +static int pcpu_atom_size __ro_after_init; +int pcpu_nr_slots __ro_after_init; +static size_t pcpu_chunk_struct_size __ro_after_init; /* cpus with the lowest and highest unit addresses */ -static unsigned int pcpu_low_unit_cpu __read_mostly; -static unsigned int pcpu_high_unit_cpu __read_mostly; +static unsigned int pcpu_low_unit_cpu __ro_after_init; +static unsigned int pcpu_high_unit_cpu __ro_after_init; /* the address of the first chunk which starts with the kernel static area */ -void *pcpu_base_addr __read_mostly; +void *pcpu_base_addr __ro_after_init; EXPORT_SYMBOL_GPL(pcpu_base_addr); -static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ -const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> 
unit offset */ +static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ /* group information, used for vm allocation */ -static int pcpu_nr_groups __read_mostly; -static const unsigned long *pcpu_group_offsets __read_mostly; -static const size_t *pcpu_group_sizes __read_mostly; +static int pcpu_nr_groups __ro_after_init; +static const unsigned long *pcpu_group_offsets __ro_after_init; +static const size_t *pcpu_group_sizes __ro_after_init; /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different * ways and thus often doesn't live in the vmalloc area. */ -static struct pcpu_chunk *pcpu_first_chunk; +struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first @@ -158,13 +145,13 @@ static struct pcpu_chunk *pcpu_first_chunk; * area doesn't exist, the following variables contain NULL and 0 * respectively. 
*/ -static struct pcpu_chunk *pcpu_reserved_chunk; -static int pcpu_reserved_chunk_limit; +struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; +static int pcpu_reserved_chunk_limit __ro_after_init; -static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ -static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */ /* chunks which need their map areas extended, protected by pcpu_lock */ static LIST_HEAD(pcpu_map_extend_chunks); @@ -672,6 +659,9 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, int to_free = 0; int *p; + lockdep_assert_held(&pcpu_lock); + pcpu_stats_area_dealloc(chunk); + freeme |= 1; /* we are searching for <given offset, in use> pair */ i = 0; @@ -735,6 +725,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map[0] = 0; chunk->map[1] = pcpu_unit_size | 1; chunk->map_used = 1; + chunk->has_reserved = false; INIT_LIST_HEAD(&chunk->list); INIT_LIST_HEAD(&chunk->map_extend_list); @@ -965,8 +956,10 @@ restart: * tasks to create chunks simultaneously. Serialize and create iff * there's still no empty chunk after grabbing the mutex. 
*/ - if (is_atomic) + if (is_atomic) { + err = "atomic alloc failed, no space left"; goto fail; + } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); @@ -984,6 +977,7 @@ restart: goto restart; area_found: + pcpu_stats_area_alloc(chunk, size); spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ @@ -1026,11 +1020,17 @@ area_found: ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size, gfp); + + trace_percpu_alloc_percpu(reserved, is_atomic, size, align, + chunk->base_addr, off, ptr); + return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: + trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); + if (!is_atomic && warn_limit) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); @@ -1280,6 +1280,8 @@ void free_percpu(void __percpu *ptr) } } + trace_percpu_free_percpu(chunk->base_addr, off, ptr); + spin_unlock_irqrestore(&pcpu_lock, flags); } EXPORT_SYMBOL_GPL(free_percpu); @@ -1656,6 +1658,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); + pcpu_stats_save_ai(ai); + /* * Allocate chunk slots. The additional last slot is for * empty chunks. 
@@ -1699,6 +1703,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (schunk->free_size) schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size; schunk->map[schunk->map_used] |= 1; + schunk->has_reserved = true; /* init dynamic chunk if necessary */ if (dyn_size) { @@ -1717,6 +1722,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, dchunk->map[1] = pcpu_reserved_chunk_limit; dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; dchunk->map_used = 2; + dchunk->has_reserved = true; } /* link the first chunk in */ @@ -1725,6 +1731,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_count_occupied_pages(pcpu_first_chunk, 1); pcpu_chunk_relocate(pcpu_first_chunk, -1); + pcpu_stats_chunk_alloc(); + trace_percpu_create_chunk(base_addr); + /* we're done */ pcpu_base_addr = base_addr; return 0; diff --git a/mm/rmap.c b/mm/rmap.c index d405f0e0ee96..ced14f1af6dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; - int cpu; if (!tlb_ubc->flush_required) return; - cpu = get_cpu(); - - if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - local_flush_tlb(); - trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); - } - - if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL); - cpumask_clear(&tlb_ubc->cpumask); + arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; - put_cpu(); } /* Flush iff there are potentially writable TLB entries that can race with IO */ @@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; - cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + 
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; /* @@ -1157,8 +1145,7 @@ void page_add_file_rmap(struct page *page, bool compound) if (!atomic_inc_and_test(&page->_mapcount)) goto out; } - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1193,12 +1180,11 @@ static void page_remove_file_rmap(struct page *page, bool compound) } /* - * We use the irq-unsafe __{inc|mod}_zone_page_state because + * We use the irq-unsafe __{inc|mod}_lruvec_page_state because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1379,15 +1365,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { int nr = 1 << compound_order(page); hugetlb_count_sub(nr, mm); + set_huge_swap_pte_at(mm, address, + pvmw.pte, pteval, + vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); + set_pte_at(mm, address, pvmw.pte, pteval); } - pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); - set_pte_at(mm, address, pvmw.pte, pteval); } else if (pte_unused(pteval)) { /* * The guest indicated that the page content is of no diff --git a/mm/shmem.c b/mm/shmem.c index e67d6ba4e98e..9418f5a9bc46 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -75,6 +75,7 @@ static struct vfsmount *shm_mnt; #include <uapi/linux/memfd.h> #include <linux/userfaultfd_k.h> #include <linux/rmap.h> +#include <linux/uuid.h> #include <linux/uaccess.h> #include 
<asm/pgtable.h> @@ -1290,7 +1291,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) SetPageUptodate(page); } - swap = get_swap_page(); + swap = get_swap_page(page); if (!swap.val) goto redirty; @@ -1326,7 +1327,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) mutex_unlock(&shmem_swaplist_mutex); free_swap: - swapcache_free(swap); + put_swap_page(page, swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) @@ -1645,8 +1646,7 @@ repeat: if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(charge_mm, - PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); @@ -1902,10 +1902,10 @@ unlock: * entry unconditionally - even if something else had already woken the * target. */ -static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); - list_del_init(&wait->task_list); + list_del_init(&wait->entry); return ret; } @@ -2840,7 +2840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); - WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; @@ -3761,6 +3761,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif + uuid_gen(&sb->s_uuid); inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (!inode) diff --git a/mm/slab.c b/mm/slab.c index 2a31ee3c5814..04dec48c3ed7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1425,11 +1425,9 @@ static struct page 
*kmem_getpages(struct kmem_cache *cachep, gfp_t flags, nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - add_zone_page_state(page_zone(page), - NR_SLAB_RECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages); else - add_zone_page_state(page_zone(page), - NR_SLAB_UNRECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ @@ -1459,11 +1457,9 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - sub_zone_page_state(page_zone(page), - NR_SLAB_RECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); else - sub_zone_page_state(page_zone(page), - NR_SLAB_UNRECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed); BUG_ON(!PageSlab(page)); __ClearPageSlabPfmemalloc(page); @@ -2040,17 +2036,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ - if (size & (BYTES_PER_WORD - 1)) { - size += (BYTES_PER_WORD - 1); - size &= ~(BYTES_PER_WORD - 1); - } + size = ALIGN(size, BYTES_PER_WORD); if (flags & SLAB_RED_ZONE) { ralign = REDZONE_ALIGN; /* If redzoning, ensure that the second redzone is suitably * aligned, by adjusting the object size accordingly. 
*/ - size += REDZONE_ALIGN - 1; - size &= ~(REDZONE_ALIGN - 1); + size = ALIGN(size, REDZONE_ALIGN); } /* 3) caller mandated alignment */ diff --git a/mm/slab.h b/mm/slab.h index 9cfcf099709c..6885e1192ec5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -274,22 +274,11 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - int ret; - if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - - ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); - if (ret) - return ret; - - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, - 1 << order); - return 0; + return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); } static __always_inline void memcg_uncharge_slab(struct page *page, int order, @@ -297,11 +286,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, { if (!memcg_kmem_enabled()) return; - - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, - -(1 << order)); memcg_kmem_uncharge(page, order); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 01a0fe2eb332..904a83be82de 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -47,13 +47,12 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, /* * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) 
*/ -static int slab_nomerge; +static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); static int __init setup_slab_nomerge(char *str) { - slab_nomerge = 1; + slab_nomerge = true; return 1; } diff --git a/mm/slub.c b/mm/slub.c index 57e5156f02be..1d3f9835f4ea 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1615,7 +1615,7 @@ out: if (!page) return NULL; - mod_zone_page_state(page_zone(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); @@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) kmemcheck_free_shadow(page, compound_order(page)); - mod_zone_page_state(page_zone(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); @@ -1829,7 +1829,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, stat(s, CPU_PARTIAL_NODE); } if (!kmem_cache_has_cpu_partial(s) - || available > s->cpu_partial / 2) + || available > slub_cpu_partial(s) / 2) break; } @@ -1993,7 +1993,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct page *page, - void *freelist) + void *freelist, struct kmem_cache_cpu *c) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -2132,6 +2132,9 @@ redo: discard_slab(s, page); stat(s, FREE_SLAB); } + + c->page = NULL; + c->freelist = NULL; } /* @@ -2266,11 +2269,9 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c->page, c->freelist); + deactivate_slab(s, c->page, c->freelist, c); c->tid = next_tid(c->tid); - c->page = NULL; - c->freelist = NULL; } /* @@ -2302,7 +2303,7 @@ static bool has_cpu_slab(int cpu, void *info) struct 
kmem_cache *s = info; struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - return c->page || c->partial; + return c->page || slub_percpu_partial(c); } static void flush_all(struct kmem_cache *s) @@ -2521,9 +2522,7 @@ redo: if (unlikely(!node_match(page, searchnode))) { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } } @@ -2534,9 +2533,7 @@ redo: * information when the page leaves the per-cpu allocator */ if (unlikely(!pfmemalloc_match(page, gfpflags))) { - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } @@ -2568,11 +2565,10 @@ load_freelist: new_slab: - if (c->partial) { - page = c->page = c->partial; - c->partial = page->next; + if (slub_percpu_partial(c)) { + page = c->page = slub_percpu_partial(c); + slub_set_percpu_partial(c, page); stat(s, CPU_PARTIAL_ALLOC); - c->freelist = NULL; goto redo; } @@ -2592,9 +2588,7 @@ new_slab: !alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. Next slab needed */ - deactivate_slab(s, page, get_freepointer(s, freelist)); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, get_freepointer(s, freelist), c); return freelist; } @@ -3410,6 +3404,39 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) s->min_partial = min; } +static void set_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. 
+ * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch + * 50% to keep some capacity around for frees. + */ + if (!kmem_cache_has_cpu_partial(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; +#endif +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -3568,33 +3595,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) */ set_min_partial(s, ilog2(s->size) / 2); - /* - * cpu_partial determined the maximum number of objects kept in the - * per cpu partial lists of a processor. - * - * Per cpu partial lists mainly contain slabs that just have one - * object freed. If they are used for allocation then they can be - * filled up again with minimal effort. The slab will never hit the - * per node partial lists and therefore no locking will be required. - * - * This setting also determines - * - * A) The number of objects from per cpu partial slabs dumped to the - * per node list when we reach the limit. - * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch - * 50% to keep some capacity around for frees. 
- */ - if (!kmem_cache_has_cpu_partial(s)) - s->cpu_partial = 0; - else if (s->size >= PAGE_SIZE) - s->cpu_partial = 2; - else if (s->size >= 1024) - s->cpu_partial = 6; - else if (s->size >= 256) - s->cpu_partial = 13; - else - s->cpu_partial = 30; + set_cpu_partial(s); #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -3981,7 +3982,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) * Disable empty slabs caching. Used to avoid pinning offline * memory cgroups by kmem pages that can be freed. */ - s->cpu_partial = 0; + slub_set_cpu_partial(s, 0); s->min_partial = 0; /* @@ -4760,7 +4761,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[node] += x; - page = READ_ONCE(c->partial); + page = slub_percpu_partial_read_once(c); if (page) { node = page_to_nid(page); if (flags & SO_TOTAL) @@ -4921,7 +4922,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->cpu_partial); + return sprintf(buf, "%u\n", slub_cpu_partial(s)); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -4936,7 +4937,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, if (objects && !kmem_cache_has_cpu_partial(s)) return -EINVAL; - s->cpu_partial = objects; + slub_set_cpu_partial(s, objects); flush_all(s); return length; } @@ -4988,7 +4989,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) int len; for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page) { pages += page->pages; @@ -5000,7 +5003,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) #ifdef CONFIG_SMP for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page && len < PAGE_SIZE - 20) 
len += sprintf(buf + len, " C%d=%d(%d)", cpu, @@ -5512,6 +5517,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) char mbuf[64]; char *buf; struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + ssize_t len; if (!attr || !attr->store || !attr->show) continue; @@ -5536,8 +5542,9 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) buf = buffer; } - attr->show(root_cache, buf); - attr->store(s, buf, strlen(buf)); + len = attr->show(root_cache, buf); + if (len > 0) + attr->store(s, buf, len); } if (buffer) @@ -5623,6 +5630,28 @@ static char *create_unique_id(struct kmem_cache *s) return name; } +static void sysfs_slab_remove_workfn(struct work_struct *work) +{ + struct kmem_cache *s = + container_of(work, struct kmem_cache, kobj_remove_work); + + if (!s->kobj.state_in_sysfs) + /* + * For a memcg cache, this may be called during + * deactivation and again on shutdown. Remove only once. + * A cache is never shut down before deactivation is + * complete, so no need to worry about synchronization. + */ + return; + +#ifdef CONFIG_MEMCG + kset_unregister(s->memcg_kset); +#endif + kobject_uevent(&s->kobj, KOBJ_REMOVE); + kobject_del(&s->kobj); + kobject_put(&s->kobj); +} + static int sysfs_slab_add(struct kmem_cache *s) { int err; @@ -5630,6 +5659,8 @@ static int sysfs_slab_add(struct kmem_cache *s) struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); + INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn); + if (!kset) { kobject_init(&s->kobj, &slab_ktype); return 0; @@ -5693,20 +5724,8 @@ static void sysfs_slab_remove(struct kmem_cache *s) */ return; - if (!s->kobj.state_in_sysfs) - /* - * For a memcg cache, this may be called during - * deactivation and again on shutdown. Remove only once. - * A cache is never shut down before deactivation is - * complete, so no need to worry about synchronization. 
- */ - return; - -#ifdef CONFIG_MEMCG - kset_unregister(s->memcg_kset); -#endif - kobject_uevent(&s->kobj, KOBJ_REMOVE); - kobject_del(&s->kobj); + kobject_get(&s->kobj); + schedule_work(&s->kobj_remove_work); } void sysfs_slab_release(struct kmem_cache *s) diff --git a/mm/sparse.c b/mm/sparse.c index 6903c8fc3085..7b4be3fd5cac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -168,6 +168,44 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, } } +/* + * There are a number of times that we loop over NR_MEM_SECTIONS, + * looking for section_present() on each. But, when we have very + * large physical address spaces, NR_MEM_SECTIONS can also be + * very large which makes the loops quite long. + * + * Keeping track of this gives us an easy way to break out of + * those loops early. + */ +int __highest_present_section_nr; +static void section_mark_present(struct mem_section *ms) +{ + int section_nr = __section_nr(ms); + + if (section_nr > __highest_present_section_nr) + __highest_present_section_nr = section_nr; + + ms->section_mem_map |= SECTION_MARKED_PRESENT; +} + +static inline int next_present_section_nr(int section_nr) +{ + do { + section_nr++; + if (present_section_nr(section_nr)) + return section_nr; + } while ((section_nr < NR_MEM_SECTIONS) && + (section_nr <= __highest_present_section_nr)); + + return -1; +} +#define for_each_present_section_nr(start, section_nr) \ + for (section_nr = next_present_section_nr(start-1); \ + ((section_nr >= 0) && \ + (section_nr < NR_MEM_SECTIONS) && \ + (section_nr <= __highest_present_section_nr)); \ + section_nr = next_present_section_nr(section_nr)) + /* Record a memory area against a node. 
*/ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -183,9 +221,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) set_section_nid(section, nid); ms = __nr_to_section(section); - if (!ms->section_mem_map) + if (!ms->section_mem_map) { ms->section_mem_map = sparse_encode_early_nid(nid) | - SECTION_MARKED_PRESENT; + SECTION_IS_ONLINE; + section_mark_present(ms); + } } } @@ -476,23 +516,19 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) int nodeid_begin = 0; unsigned long pnum_begin = 0; - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(0, pnum) { struct mem_section *ms; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid_begin = sparse_early_nid(ms); pnum_begin = pnum; break; } map_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(pnum_begin + 1, pnum) { struct mem_section *ms; int nodeid; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid = sparse_early_nid(ms); if (nodeid == nodeid_begin) { @@ -561,10 +597,7 @@ void __init sparse_init(void) (void *)map_map); #endif - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - if (!present_section_nr(pnum)) - continue; - + for_each_present_section_nr(0, pnum) { usemap = usemap_map[pnum]; if (!usemap) continue; @@ -590,6 +623,48 @@ void __init sparse_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range [start_pfn, end_pfn) as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms; + + /* onlining code should never touch invalid ranges */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map |= SECTION_IS_ONLINE; + } +} + 
+#ifdef CONFIG_MEMORY_HOTREMOVE +/* Mark all memory sections within the pfn range [start_pfn, end_pfn) as offline */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms; + + /* + * TODO this needs some double checking. Offlining code makes + * sure to check pfn_valid but those checks might be just bogus + */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map &= ~SECTION_IS_ONLINE; + } +} +#endif + #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { @@ -686,10 +761,9 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct pglist_data *pgdat = zone->zone_pgdat; struct mem_section *ms; struct page *memmap; unsigned long *usemap; @@ -722,7 +796,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); - ms->section_mem_map |= SECTION_MARKED_PRESENT; + section_mark_present(ms); ret = sparse_init_one_section(ms, section_nr, memmap, usemap); diff --git a/mm/swap.c b/mm/swap.c index 98d08b4579fa..4f44dbd7f780 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -591,6 +591,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + count_memcg_page_event(page, PGLAZYFREE); update_page_reclaim_stat(lruvec, 1, 0); } } diff --git 
a/mm/swap_cgroup.c b/mm/swap_cgroup.c index ac6318a064d3..fcd2740f4ed7 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type) if (!page) goto not_enough_page; ctrl->map[idx] = page; + + if (!(idx % SWAP_CLUSTER_MAX)) + cond_resched(); } return 0; not_enough_page: @@ -58,21 +61,27 @@ not_enough_page: return -ENOMEM; } +static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, + pgoff_t offset) +{ + struct page *mappage; + struct swap_cgroup *sc; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, struct swap_cgroup_ctrl **ctrlp) { pgoff_t offset = swp_offset(ent); struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; if (ctrlp) *ctrlp = ctrl; - - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; + return __lookup_swap_cgroup(ctrl, offset); } /** @@ -105,25 +114,39 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, } /** - * swap_cgroup_record - record mem_cgroup for this swp_entry. - * @ent: swap entry to be recorded into + * swap_cgroup_record - record mem_cgroup for a set of swap entries + * @ent: the first swap entry to be recorded into * @id: mem_cgroup to be recorded + * @nr_ents: number of swap entries to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) 
*/ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents) { struct swap_cgroup_ctrl *ctrl; struct swap_cgroup *sc; unsigned short old; unsigned long flags; + pgoff_t offset = swp_offset(ent); + pgoff_t end = offset + nr_ents; sc = lookup_swap_cgroup(ent, &ctrl); spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; - sc->id = id; + for (;;) { + VM_BUG_ON(sc->id != old); + sc->id = id; + offset++; + if (offset == end) + break; + if (offset % SC_PER_PAGE) + sc++; + else + sc = __lookup_swap_cgroup(ctrl, offset); + } spin_unlock_irqrestore(&ctrl->lock, flags); return old; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 58f6c78f1dad..90c1032a8ac3 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -263,7 +263,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, + cache->slots); return cache->nr; } @@ -301,11 +302,19 @@ direct_free: return 0; } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry, *pentry; struct swap_slots_cache *cache; + entry.val = 0; + + if (PageTransHuge(page)) { + if (IS_ENABLED(CONFIG_THP_SWAP)) + get_swap_pages(1, true, &entry); + return entry; + } + /* * Preemption is allowed here, because we may sleep * in refill_swap_slots_cache(). 
But it is safe, because @@ -317,7 +326,6 @@ swp_entry_t get_swap_page(void) */ cache = raw_cpu_ptr(&swp_slots); - entry.val = 0; if (check_cache_active()) { mutex_lock(&cache->alloc_lock); if (cache->slots) { @@ -337,7 +345,7 @@ repeat: return entry; } - get_swap_pages(1, &entry); + get_swap_pages(1, false, &entry); return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 539b8885e3d1..9c71b6b2562f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -19,6 +19,7 @@ #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/swap_slots.h> +#include <linux/huge_mm.h> #include <asm/pgtable.h> @@ -38,6 +39,7 @@ struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) +#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) static struct { unsigned long add_total; @@ -90,39 +92,46 @@ void show_swap_cache_info(void) */ int __add_to_swap_cache(struct page *page, swp_entry_t entry) { - int error; + int error, i, nr = hpage_nr_pages(page); struct address_space *address_space; + pgoff_t idx = swp_offset(entry); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - get_page(page); + page_ref_add(page, nr); SetPageSwapCache(page); - set_page_private(page, entry.val); address_space = swap_address_space(entry); spin_lock_irq(&address_space->tree_lock); - error = radix_tree_insert(&address_space->page_tree, - swp_offset(entry), page); - if (likely(!error)) { - address_space->nrpages++; - __inc_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); + for (i = 0; i < nr; i++) { + set_page_private(page + i, entry.val + i); + error = radix_tree_insert(&address_space->page_tree, + idx + i, page + i); + if (unlikely(error)) + break; } - spin_unlock_irq(&address_space->tree_lock); - - if (unlikely(error)) { + if (likely(!error)) { + 
address_space->nrpages += nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + ADD_CACHE_INFO(add_total, nr); + } else { /* * Only the context which have set SWAP_HAS_CACHE flag * would call add_to_swap_cache(). * So add_to_swap_cache() doesn't returns -EEXIST. */ VM_BUG_ON(error == -EEXIST); - set_page_private(page, 0UL); + set_page_private(page + i, 0UL); + while (i--) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0UL); + } ClearPageSwapCache(page); - put_page(page); + page_ref_sub(page, nr); } + spin_unlock_irq(&address_space->tree_lock); return error; } @@ -132,7 +141,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_maybe_preload(gfp_mask); + error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -146,8 +155,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t entry; struct address_space *address_space; + int i, nr = hpage_nr_pages(page); + swp_entry_t entry; + pgoff_t idx; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); @@ -155,12 +166,15 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, swp_offset(entry)); - set_page_private(page, 0); + idx = swp_offset(entry); + for (i = 0; i < nr; i++) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0); + } ClearPageSwapCache(page); - address_space->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(del_total); + address_space->nrpages -= nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + ADD_CACHE_INFO(del_total, nr); } /** @@ -170,7 +184,7 @@ void 
__delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page, struct list_head *list) +int add_to_swap(struct page *page) { swp_entry_t entry; int err; @@ -178,20 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - entry = get_swap_page(); + entry = get_swap_page(page); if (!entry.val) return 0; - if (mem_cgroup_try_charge_swap(page, entry)) { - swapcache_free(entry); - return 0; - } - - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page_to_list(page, list))) { - swapcache_free(entry); - return 0; - } + if (mem_cgroup_try_charge_swap(page, entry)) + goto fail; /* * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -206,17 +212,19 @@ int add_to_swap(struct page *page, struct list_head *list) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - if (!err) { - return 1; - } else { /* -ENOMEM radix-tree allocation failure */ + /* -ENOMEM radix-tree allocation failure */ + if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. 
*/ - swapcache_free(entry); - return 0; - } + goto fail; + + return 1; + +fail: + put_swap_page(page, entry); + return 0; } /* @@ -237,8 +245,8 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - swapcache_free(entry); - put_page(page); + put_swap_page(page, entry); + page_ref_sub(page, hpage_nr_pages(page)); } /* @@ -295,7 +303,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), swp_offset(entry)); - if (page) { + if (page && likely(!PageTransCompound(page))) { INC_CACHE_INFO(find_success); if (TestClearPageReadahead(page)) atomic_inc(&swapin_readahead_hits); @@ -389,7 +397,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry); + put_swap_page(new_page, entry); } while (err != -ENOMEM); if (new_page) @@ -506,7 +514,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; - if (offset != entry_offset) + if (offset != entry_offset && likely(!PageTransCompound(page))) SetPageReadahead(page); put_page(page); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f6cba1b6632..811d90e1c929 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -37,6 +37,7 @@ #include <linux/swapfile.h> #include <linux/export.h> #include <linux/swap_slots.h> +#include <linux/sort.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -199,7 +200,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR +#else #define SWAPFILE_CLUSTER 256 +#endif #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, @@ -374,6 +379,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, schedule_work(&si->discard_work); } +static void __free_cluster(struct 
swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); + cluster_list_add_tail(&si->free_clusters, ci, idx); +} + /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. caller should hold si->lock. @@ -394,10 +407,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) spin_lock(&si->lock); ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); - cluster_set_flag(ci, CLUSTER_FLAG_FREE); - unlock_cluster(ci); - cluster_list_add_tail(&si->free_clusters, info, idx); - ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); + __free_cluster(si, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); unlock_cluster(ci); @@ -415,6 +425,34 @@ static void swap_discard_work(struct work_struct *work) spin_unlock(&si->lock); } +static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); + cluster_list_del_first(&si->free_clusters, ci); + cluster_set_count_flag(ci + idx, 0, 0); +} + +static void free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info + idx; + + VM_BUG_ON(cluster_count(ci) != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed + * after discard. + */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { + swap_cluster_schedule_discard(si, idx); + return; + } + + __free_cluster(si, idx); +} + /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased. 
@@ -426,11 +464,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (!cluster_info) return; - if (cluster_is_free(&cluster_info[idx])) { - VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); - cluster_list_del_first(&p->free_clusters, cluster_info); - cluster_set_count_flag(&cluster_info[idx], 0, 0); - } + if (cluster_is_free(&cluster_info[idx])) + alloc_cluster(p, idx); VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], @@ -454,21 +489,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p, cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) - 1); - if (cluster_count(&cluster_info[idx]) == 0) { - /* - * If the swap is discardable, prepare discard the cluster - * instead of free it immediately. The cluster will be freed - * after discard. - */ - if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == - (SWP_WRITEOK | SWP_PAGE_DISCARD)) { - swap_cluster_schedule_discard(p, idx); - return; - } - - cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - cluster_list_add_tail(&p->free_clusters, cluster_info, idx); - } + if (cluster_count(&cluster_info[idx]) == 0) + free_cluster(p, idx); } /* @@ -558,6 +580,60 @@ new_cluster: return found_free; } +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned int end = offset + nr_entries - 1; + + if (offset == si->lowest_bit) + si->lowest_bit += nr_entries; + if (end == si->highest_bit) + si->highest_bit -= nr_entries; + si->inuse_pages += nr_entries; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } +} + +static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned long end = offset + nr_entries - 1; + void (*swap_slot_free_notify)(struct 
block_device *, unsigned long); + + if (offset < si->lowest_bit) + si->lowest_bit = offset; + if (end > si->highest_bit) { + bool was_full = !si->highest_bit; + + si->highest_bit = end; + if (was_full && (si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&si->avail_list)); + if (plist_node_empty(&si->avail_list)) + plist_add(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } + atomic_long_add(nr_entries, &nr_swap_pages); + si->inuse_pages -= nr_entries; + if (si->flags & SWP_BLKDEV) + swap_slot_free_notify = + si->bdev->bd_disk->fops->swap_slot_free_notify; + else + swap_slot_free_notify = NULL; + while (offset <= end) { + frontswap_invalidate_page(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; + } +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -676,18 +752,7 @@ checks: inc_cluster_info_page(si, si->cluster_info, offset); unlock_cluster(ci); - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - si->inuse_pages++; - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + swap_range_alloc(si, offset, 1); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); @@ -766,6 +831,52 @@ no_page: return n_ret; } +#ifdef CONFIG_THP_SWAP +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + unsigned long idx; + struct swap_cluster_info *ci; + unsigned long offset, i; + unsigned char *map; + + if (cluster_list_empty(&si->free_clusters)) + return 0; + + idx = cluster_list_first(&si->free_clusters); + offset = idx * SWAPFILE_CLUSTER; + ci = lock_cluster(si, offset); + alloc_cluster(si, idx); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + 
+ map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] = SWAP_HAS_CACHE; + unlock_cluster(ci); + swap_range_alloc(si, offset, SWAPFILE_CLUSTER); + *slot = swp_entry(si->type, offset); + + return 1; +} + +static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + unsigned long offset = idx * SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + cluster_set_count_flag(ci, 0, 0); + free_cluster(si, idx); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); +} +#else +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + VM_WARN_ON_ONCE(1); + return 0; +} +#endif /* CONFIG_THP_SWAP */ + static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) { @@ -781,13 +892,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) { + unsigned long nr_pages = cluster ? 
SWAPFILE_CLUSTER : 1; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; - avail_pgs = atomic_long_read(&nr_swap_pages); + /* Only single cluster request supported */ + WARN_ON_ONCE(n_goal > 1 && cluster); + + avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; if (avail_pgs <= 0) goto noswap; @@ -797,7 +912,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal, &nr_swap_pages); + atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -823,10 +938,13 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries); + if (cluster) + n_ret = swap_alloc_cluster(si, swp_entries); + else + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret) + if (n_ret || cluster) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -852,7 +970,8 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); + atomic_long_add((long)(n_goal - n_ret) * nr_pages, + &nr_swap_pages); noswap: return n_ret; } @@ -1008,32 +1127,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) { - bool was_full = !p->highest_bit; - - p->highest_bit = offset; - if (was_full && (p->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&p->avail_list)); - if (plist_node_empty(&p->avail_list)) - plist_add(&p->avail_list, - &swap_avail_head); - spin_unlock(&swap_avail_lock); - } - } - atomic_long_inc(&nr_swap_pages); - p->inuse_pages--; - frontswap_invalidate_page(p->type, offset); - if (p->flags & SWP_BLKDEV) { - struct gendisk *disk = p->bdev->bd_disk; 
- - if (disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, - offset); - } + mem_cgroup_uncharge_swap(entry, 1); + swap_range_free(p, offset, 1); } /* @@ -1054,7 +1149,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void swapcache_free(swp_entry_t entry) +static void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; @@ -1065,6 +1160,52 @@ void swapcache_free(swp_entry_t entry) } } +#ifdef CONFIG_THP_SWAP +static void swapcache_free_cluster(swp_entry_t entry) +{ + unsigned long offset = swp_offset(entry); + unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned char *map; + unsigned int i; + + si = swap_info_get(entry); + if (!si) + return; + + ci = lock_cluster(si, offset); + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + VM_BUG_ON(map[i] != SWAP_HAS_CACHE); + map[i] = 0; + } + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); +} +#else +static inline void swapcache_free_cluster(swp_entry_t entry) +{ +} +#endif /* CONFIG_THP_SWAP */ + +void put_swap_page(struct page *page, swp_entry_t entry) +{ + if (!PageTransHuge(page)) + swapcache_free(entry); + else + swapcache_free_cluster(entry); +} + +static int swp_entry_cmp(const void *ent1, const void *ent2) +{ + const swp_entry_t *e1 = ent1, *e2 = ent2; + + return (int)swp_type(*e1) - (int)swp_type(*e2); +} + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; @@ -1075,6 +1216,14 @@ void swapcache_free_entries(swp_entry_t *entries, int n) prev = NULL; p = NULL; + + /* + * Sort swap entries by swap device, so each lock is only taken once. + * nr_swapfiles isn't absolutely correct, but the overhead of sort() is + * so low that it isn't necessary to optimize further. 
+ */ + if (nr_swapfiles > 1) + sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) diff --git a/mm/util.c b/mm/util.c index 464df3489903..26be6407abd7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -357,8 +357,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); /* - * Make sure that larger requests are not too disruptive - no OOM - * killer and no allocation failure warnings as we have a fallback + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. */ if (size > PAGE_SIZE) { kmalloc_flags |= __GFP_NOWARN; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 34a1c3e46ed7..6211a807cb31 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -287,10 +287,21 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (p4d_none(*p4d)) return NULL; pud = pud_offset(p4d, addr); - if (pud_none(*pud)) + + /* + * Don't dereference bad PUD or PMD (below) entries. This will also + * identify huge mappings, which we may encounter on architectures + * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be + * identified as vmalloc addresses by is_vmalloc_addr(), but are + * not [unambiguously] associated with a struct page, so there is + * no correct value to return for them. 
+ */ + WARN_ON_ONCE(pud_bad(*pud)); + if (pud_none(*pud) || pud_bad(*pud)) return NULL; pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + WARN_ON_ONCE(pmd_bad(*pmd)); + if (pmd_none(*pmd) || pmd_bad(*pmd)) return NULL; ptep = pte_offset_map(pmd, addr); @@ -1759,12 +1770,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */ clear_vm_uninitialized_flag(area); - /* - * A ref_count = 2 is needed because vm_struct allocated in - * __get_vm_area_node() contains a reference to the virtual address of - * the vmalloc'ed block. - */ - kmemleak_alloc(addr, real_size, 2, gfp_mask); + kmemleak_vmalloc(area, size, gfp_mask); return addr; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 6063581f705c..ce0618bfa8d0 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -115,9 +115,9 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, unsigned long pressure = 0; /* - * reclaimed can be greater than scanned in cases - * like THP, where the scanned is 1 and reclaimed - * could be 512 + * reclaimed can be greater than scanned for things such as reclaimed + * slab pages. shrink_node() just adds reclaimed pages without a + * related increment to scanned pages. 
*/ if (reclaimed >= scanned) goto out; diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ad39bbc79e6..9e95fafc026b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -708,7 +708,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - swapcache_free(swap); + put_swap_page(page, swap); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -1125,8 +1125,36 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page, page_list)) + if (PageTransHuge(page)) { + /* cannot split THP, skip it */ + if (!can_split_huge_page(page, NULL)) + goto activate_locked; + /* + * Split pages without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. + */ + if (!compound_mapcount(page) && + split_huge_page_to_list(page, page_list)) + goto activate_locked; + } + if (!add_to_swap(page)) { + if (!PageTransHuge(page)) + goto activate_locked; + /* Split THP and swap individual base pages */ + if (split_huge_page_to_list(page, page_list)) + goto activate_locked; + if (!add_to_swap(page)) + goto activate_locked; + } + + /* XXX: We don't support THP writes */ + if (PageTransHuge(page) && + split_huge_page_to_list(page, page_list)) { + delete_from_swap_cache(page); goto activate_locked; + } + may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1266,6 +1294,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } count_vm_event(PGLAZYFREED); + count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -1295,6 +1324,7 @@ activate_locked: if (!PageMlocked(page)) { SetPageActive(page); pgactivate++; + count_memcg_page_event(page, PGACTIVATE); } keep_locked: unlock_page(page); @@ -1734,11 +1764,16 @@ 
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); - else + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, + nr_scanned); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_DIRECT, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, + nr_scanned); } spin_unlock_irq(&pgdat->lru_lock); @@ -1750,11 +1785,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); - else + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, + nr_reclaimed); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, + nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); @@ -1899,8 +1939,11 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, } } - if (!is_active_lru(lru)) + if (!is_active_lru(lru)) { __count_vm_events(PGDEACTIVATE, nr_moved); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_moved); + } return nr_moved; } @@ -1938,6 +1981,7 @@ static void shrink_active_list(unsigned long nr_to_scan, reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -2967,7 +3011,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = 
current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2982,12 +3026,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * 1 is returned so that the page allocator does not OOM kill at this * point. */ - if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, - gfp_mask, + sc.gfp_mask, sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3652,7 +3696,7 @@ int kswapd_run(int nid) pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ - BUG_ON(system_state == SYSTEM_BOOTING); + BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; @@ -3774,17 +3818,16 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - int classzone_idx = gfp_zone(gfp_mask); unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, - .reclaim_idx = classzone_idx, + .reclaim_idx = gfp_zone(gfp_mask), }; cond_resched(); @@ -3795,7 +3838,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(gfp_mask); + lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; 
@@ -3831,7 +3874,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && - sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 76f73670200a..744ceaeb42a0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -928,8 +928,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", "nr_bounce", @@ -952,6 +950,8 @@ const char * const vmstat_text[] = { "nr_inactive_file", "nr_active_file", "nr_unevictable", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", "workingset_refault", @@ -1018,6 +1018,7 @@ const char * const vmstat_text[] = { "drop_pagecache", "drop_slab", + "oom_kill", #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", @@ -1223,11 +1224,10 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { struct page *page; - if (!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - page = pfn_to_page(pfn); - /* Watch for unexpected holes punched in the memmap */ if (!memmap_valid_within(pfn, page, zone)) continue; @@ -1322,7 +1322,7 @@ static int fragmentation_open(struct inode *inode, struct file *file) return seq_open(file, &fragmentation_op); } -static const struct file_operations fragmentation_file_operations = { +static const struct file_operations buddyinfo_file_operations = { .open = fragmentation_open, .read = seq_read, .llseek = seq_lseek, @@ -1341,7 +1341,7 @@ static int pagetypeinfo_open(struct inode *inode, struct file *file) return seq_open(file, &pagetypeinfo_op); } -static const 
struct file_operations pagetypeinfo_file_ops = { +static const struct file_operations pagetypeinfo_file_operations = { .open = pagetypeinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1463,7 +1463,7 @@ static int zoneinfo_open(struct inode *inode, struct file *file) return seq_open(file, &zoneinfo_op); } -static const struct file_operations proc_zoneinfo_file_operations = { +static const struct file_operations zoneinfo_file_operations = { .open = zoneinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1552,7 +1552,7 @@ static int vmstat_open(struct inode *inode, struct file *file) return seq_open(file, &vmstat_op); } -static const struct file_operations proc_vmstat_file_operations = { +static const struct file_operations vmstat_file_operations = { .open = vmstat_open, .read = seq_read, .llseek = seq_lseek, @@ -1785,10 +1785,10 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif #ifdef CONFIG_PROC_FS - proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); - proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); - proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); - proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); + proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations); + proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations); + proc_create("vmstat", 0444, NULL, &vmstat_file_operations); + proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations); #endif } diff --git a/mm/workingset.c b/mm/workingset.c index b8c9ab678479..7119cd745ace 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -288,12 +288,10 @@ bool workingset_refault(void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; - inc_node_state(pgdat, WORKINGSET_REFAULT); - inc_memcg_state(memcg, WORKINGSET_REFAULT); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT); if (refault_distance <= active_file) { - inc_node_state(pgdat, WORKINGSET_ACTIVATE); - 
inc_memcg_state(memcg, WORKINGSET_ACTIVATE); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); rcu_read_unlock(); return true; } @@ -474,8 +472,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; - inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); diff --git a/mm/zswap.c b/mm/zswap.c index eedc27894b10..d39581a076c3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -371,10 +371,9 @@ static int zswap_dstmem_prepare(unsigned int cpu) u8 *dst; dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) { - pr_err("can't allocate compressor buffer\n"); + if (!dst) return -ENOMEM; - } + per_cpu(zswap_dstmem, cpu) = dst; return 0; } @@ -515,10 +514,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) } pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) { - pr_err("pool alloc failed\n"); + if (!pool) return NULL; - } /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); @@ -1158,7 +1155,7 @@ static void zswap_frontswap_init(unsigned type) { struct zswap_tree *tree; - tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return; |