From 23955622ff8d231bcc9650b3d06583f117a6e3ba Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 10 Jul 2017 15:47:11 -0700 Subject: swap: add block io poll in swapin path For fast flash disk, async IO could introduce overhead because of context switch. block-mq now supports IO poll, which improves performance and latency a lot. swapin is a good place to use this technique, because the task is waiting for the swapin page to continue execution. In my virtual machine, directly read 4k data from a NVMe with iopoll is about 60% better than that without poll. With iopoll support in swapin patch, my microbenchmark (a task does random memory write) is about 10%~25% faster. CPU utilization increases a lot though, 2x and even 3x CPU utilization. This will depend on disk speed. While iopoll in swapin isn't intended for all usage cases, it's a win for latency sensitive workloads with high speed swap disk. block layer has knob to control poll in runtime. If poll isn't enabled in block layer, there should be no noticeable change in swapin. I got a chance to run the same test in a NVMe with DRAM as the media. In simple fio IO test, blkpoll boosts 50% performance in single thread test and ~20% in 8 threads test. So this is the base line. In above swap test, blkpoll boosts ~27% performance in single thread test. blkpoll uses 2x CPU time though. If we enable hybrid polling, the performance gain has very slight drop but CPU time is only 50% worse than that without blkpoll. Also we can adjust parameter of hybrid poll, with it, the CPU time penalty is reduced further. In 8 threads test, blkpoll doesn't help though. The performance is similar to that without blkpoll, but cpu utilization is similar too. There is lock contention in swap path. The cpu time spending on blkpoll isn't high. So overall, blkpoll swapin isn't worse than that without it. The swapin readahead might read several pages in at the same time and form a big IO request. 
Since the IO will take longer time, it doesn't make sense to do poll, so the patch only does iopoll for single page swapin. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/070c3c3e40b711e7b1390002c991e86a-b5408f0@7511894063d3764ff01ea8111f5a004d7dd700ed078797c204a24e620ddb965c Signed-off-by: Shaohua Li Cc: Tim Chen Cc: Huang Ying Cc: Jens Axboe Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 5ab1c98c7d27..61e7180cee21 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -331,7 +331,7 @@ extern void kswapd_stop(int nid); #include /* for bio_end_io_t */ /* linux/mm/page_io.c */ -extern int swap_readpage(struct page *); +extern int swap_readpage(struct page *page, bool do_poll); extern int swap_writepage(struct page *page, struct writeback_control *wbc); extern void end_swap_bio_write(struct bio *bio); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, @@ -362,7 +362,8 @@ extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); extern struct page *lookup_swap_cache(swp_entry_t); extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, - struct vm_area_struct *vma, unsigned long addr); + struct vm_area_struct *vma, unsigned long addr, + bool do_poll); extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated); -- cgit v1.2.3-70-g09d2 From b37ff71cc626a0c1b5e098ff9a0b723815f6aaeb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 10 Jul 2017 15:47:38 -0700 Subject: mm: hwpoison: change PageHWPoison behavior on hugetlb pages We'd like to narrow down the error region in memory error on hugetlb pages. 
However, currently we set PageHWPoison flags on all subpages in the error hugepage and add # of subpages to num_hwpoison_pages, which doesn't fit our purpose. So this patch changes the behavior and we only set PageHWPoison on the head page then increase num_hwpoison_pages only by 1. This is a preparation for narrow-down part which comes in later patches. Link: http://lkml.kernel.org/r/1496305019-5493-4-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Cc: Michal Hocko Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 9 ----- mm/memory-failure.c | 87 ++++++++++++++----------------------------------- 2 files changed, 24 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 5c3a5f3e7eec..c5ff7b217ee6 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -196,15 +196,6 @@ static inline void num_poisoned_pages_dec(void) atomic_long_dec(&num_poisoned_pages); } -static inline void num_poisoned_pages_add(long num) -{ - atomic_long_add(num, &num_poisoned_pages); -} - -static inline void num_poisoned_pages_sub(long num) -{ - atomic_long_sub(num, &num_poisoned_pages); -} #else static inline swp_entry_t make_hwpoison_entry(struct page *page) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e3bf6432ed25..a9ddb0e72f5b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1009,22 +1009,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, return unmap_success; } -static void set_page_hwpoison_huge_page(struct page *hpage) -{ - int i; - int nr_pages = 1 << compound_order(hpage); - for (i = 0; i < nr_pages; i++) - SetPageHWPoison(hpage + i); -} - -static void clear_page_hwpoison_huge_page(struct page *hpage) -{ - int i; - int nr_pages = 1 << compound_order(hpage); - for (i = 0; i < nr_pages; i++) - ClearPageHWPoison(hpage + i); -} - /** * memory_failure 
- Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -1050,7 +1034,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) struct page *hpage; struct page *orig_head; int res; - unsigned int nr_pages; unsigned long page_flags; if (!sysctl_memory_failure_recovery) @@ -1064,24 +1047,23 @@ int memory_failure(unsigned long pfn, int trapno, int flags) p = pfn_to_page(pfn); orig_head = hpage = compound_head(p); + + /* tmporary check code, to be updated in later patches */ + if (PageHuge(p)) { + if (TestSetPageHWPoison(hpage)) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); + return 0; + } + goto tmp; + } if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); return 0; } - /* - * Currently errors on hugetlbfs pages are measured in hugepage units, - * so nr_pages should be 1 << compound_order. OTOH when errors are on - * transparent hugepages, they are supposed to be split and error - * measurement is done in normal page units. So nr_pages should be one - * in this case. - */ - if (PageHuge(p)) - nr_pages = 1 << compound_order(hpage); - else /* normal page or thp */ - nr_pages = 1; - num_poisoned_pages_add(nr_pages); +tmp: + num_poisoned_pages_inc(); /* * We need/can do nothing about count=0 pages. @@ -1109,12 +1091,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (PageHWPoison(hpage)) { if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) || (p != hpage && TestSetPageHWPoison(hpage))) { - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); unlock_page(hpage); return 0; } } - set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); action_result(pfn, MF_MSG_FREE_HUGE, res ? 
MF_IGNORED : MF_DELAYED); @@ -1137,7 +1118,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) pr_err("Memory failure: %#lx: thp split failed\n", pfn); if (TestClearPageHWPoison(p)) - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); put_hwpoison_page(p); return -EBUSY; } @@ -1193,14 +1174,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (!PageHWPoison(p)) { pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); unlock_page(hpage); put_hwpoison_page(hpage); return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); unlock_page(hpage); put_hwpoison_page(hpage); return 0; @@ -1219,14 +1200,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) put_hwpoison_page(hpage); return 0; } - /* - * Set PG_hwpoison on all pages in an error hugepage, - * because containment is done in hugepage unit for now. - * Since we have done TestSetPageHWPoison() for the head page with - * page lock held, we can safely set PG_hwpoison bits on tail pages. 
- */ - if (PageHuge(p)) - set_page_hwpoison_huge_page(hpage); /* * It's very difficult to mess with pages currently under IO @@ -1397,7 +1370,6 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; - unsigned int nr_pages; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1442,8 +1414,6 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); - if (!get_hwpoison_page(p)) { /* * Since HWPoisoned hugepage should have non-zero refcount, @@ -1473,10 +1443,8 @@ int unpoison_memory(unsigned long pfn) if (TestClearPageHWPoison(page)) { unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", pfn, &unpoison_rs); - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); freeit = 1; - if (PageHuge(page)) - clear_page_hwpoison_huge_page(page); } unlock_page(page); @@ -1608,14 +1576,10 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = -EIO; } else { /* overcommit hugetlb page will be freed to buddy */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); + SetPageHWPoison(page); + if (PageHuge(page)) dequeue_hwpoisoned_huge_page(hpage); - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - SetPageHWPoison(page); - num_poisoned_pages_inc(); - } + num_poisoned_pages_inc(); } return ret; } @@ -1731,15 +1695,12 @@ static int soft_offline_in_use_page(struct page *page, int flags) static void soft_offline_free_page(struct page *page) { - if (PageHuge(page)) { - struct page *hpage = compound_head(page); + struct page *head = compound_head(page); - set_page_hwpoison_huge_page(hpage); - if (!dequeue_hwpoisoned_huge_page(hpage)) - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - if (!TestSetPageHWPoison(page)) - num_poisoned_pages_inc(); + if (!TestSetPageHWPoison(head)) { + num_poisoned_pages_inc(); + if (PageHuge(head)) + dequeue_hwpoisoned_huge_page(head); } } -- cgit v1.2.3-70-g09d2 From 
c3114a84f7f96c9d5c73c8bfa7e21ff42fda97e2 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 10 Jul 2017 15:47:41 -0700 Subject: mm: hugetlb: soft-offline: dissolve source hugepage after successful migration Currently hugepage migrated by soft-offline (i.e. due to correctable memory errors) is contained as a hugepage, which means many non-error pages in it are unreusable, i.e. wasted. This patch solves this issue by dissolving source hugepages into buddy. As done in previous patch, PageHWPoison is set only on a head page of the error hugepage. Then in dissolving we move the PageHWPoison flag to the raw error page so that all healthy subpages return back to buddy. [arnd@arndb.de: fix warnings: replace some macros with inline functions] Link: http://lkml.kernel.org/r/20170609102544.2947326-1-arnd@arndb.de Link: http://lkml.kernel.org/r/1496305019-5493-5-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Anshuman Khandual Signed-off-by: Naoya Horiguchi Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 31 +++++++++++++++++++++++++++---- mm/hugetlb.c | 10 +++++++++- mm/memory-failure.c | 5 +---- mm/migrate.c | 2 ++ 4 files changed, 39 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 46bfb702e7d6..668ab1742ef6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -472,6 +472,7 @@ static inline pgoff_t basepage_index(struct page *page) return __basepage_index(page); } +extern int dissolve_free_huge_page(struct page *page); extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); static inline bool hugepage_migration_supported(struct hstate *h) @@ -550,15 +551,37 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; } -#define hstate_index_to_shift(index) 0 -#define hstate_index(h) 0 + +static inline unsigned hstate_index_to_shift(unsigned index) +{ + 
return 0; +} + +static inline int hstate_index(struct hstate *h) +{ + return 0; +} static inline pgoff_t basepage_index(struct page *page) { return page->index; } -#define dissolve_free_huge_pages(s, e) 0 -#define hugepage_migration_supported(h) false + +static inline int dissolve_free_huge_page(struct page *page) +{ + return 0; +} + +static inline int dissolve_free_huge_pages(unsigned long start_pfn, + unsigned long end_pfn) +{ + return 0; +} + +static inline bool hugepage_migration_supported(struct hstate *h) +{ + return false; +} static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41a1b48cefbf..b2d44363837a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1459,7 +1459,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, * number of free hugepages would be reduced below the number of reserved * hugepages. */ -static int dissolve_free_huge_page(struct page *page) +int dissolve_free_huge_page(struct page *page) { int rc = 0; @@ -1472,6 +1472,14 @@ static int dissolve_free_huge_page(struct page *page) rc = -EBUSY; goto out; } + /* + * Move PageHWPoison flag from head page to the raw error page, + * which makes any subpages rather than the error page reusable. 
+ */ + if (PageHWPoison(head) && page != head) { + SetPageHWPoison(page); + ClearPageHWPoison(head); + } list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a9ddb0e72f5b..42c5803e6275 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1575,11 +1575,8 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - /* overcommit hugetlb page will be freed to buddy */ - SetPageHWPoison(page); if (PageHuge(page)) - dequeue_hwpoisoned_huge_page(hpage); - num_poisoned_pages_inc(); + dissolve_free_huge_page(page); } return ret; } diff --git a/mm/migrate.c b/mm/migrate.c index 051cc1555d36..8935cbe362ce 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1252,6 +1252,8 @@ put_anon: out: if (rc != -EAGAIN) putback_active_hugepage(hpage); + if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage)) + num_poisoned_pages_inc(); /* * If migration was not successful and there's a freeing callback, use -- cgit v1.2.3-70-g09d2 From ddd40d8a2c4ef8f2152ea6d227e11475cf7e5bfa Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 10 Jul 2017 15:47:53 -0700 Subject: mm: hugetlb: delete dequeue_hwpoisoned_huge_page() dequeue_hwpoisoned_huge_page() is no longer used, so let's remove it. 
Link: http://lkml.kernel.org/r/1496305019-5493-9-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 ----- mm/hugetlb.c | 34 ---------------------------------- mm/memory-failure.c | 11 ----------- 3 files changed, 50 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 668ab1742ef6..57f700ac127e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -116,7 +116,6 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); void free_huge_page(struct page *page); @@ -192,10 +191,6 @@ static inline void hugetlb_show_meminfo(void) #define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ src_addr, pagep) ({ BUG(); 0; }) #define huge_pte_offset(mm, address, sz) 0 -static inline int dequeue_hwpoisoned_huge_page(struct page *page) -{ - return 0; -} static inline bool isolate_huge_page(struct page *page, struct list_head *list) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b2d44363837a..8254e8f6db6b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4746,40 +4746,6 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); } -#ifdef CONFIG_MEMORY_FAILURE - -/* - * This function is called from memory failure code. 
- */ -int dequeue_hwpoisoned_huge_page(struct page *hpage) -{ - struct hstate *h = page_hstate(hpage); - int nid = page_to_nid(hpage); - int ret = -EBUSY; - - spin_lock(&hugetlb_lock); - /* - * Just checking !page_huge_active is not enough, because that could be - * an isolated/hwpoisoned hugepage (which have >0 refcount). - */ - if (!page_huge_active(hpage) && !page_count(hpage)) { - /* - * Hwpoisoned hugepage isn't linked to activelist or freelist, - * but dangling hpage->lru can trigger list-debug warnings - * (this happens when we call unpoison_memory() on it), - * so let it point to itself with list_del_init(). - */ - list_del_init(&hpage->lru); - set_page_refcounted(hpage); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - ret = 0; - } - spin_unlock(&hugetlb_lock); - return ret; -} -#endif - bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6f8f69f4a986..2aec57c07652 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1458,17 +1458,6 @@ int unpoison_memory(unsigned long pfn) } if (!get_hwpoison_page(p)) { - /* - * Since HWPoisoned hugepage should have non-zero refcount, - * race between memory failure and unpoison seems to happen. - * In such case unpoison fails and memory failure runs - * to the end. - */ - if (PageHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n", - pfn, &unpoison_rs); - return 0; - } if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", -- cgit v1.2.3-70-g09d2 From 1860033237d4be09c5d7382585f0c7229367a534 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:48:02 -0700 Subject: mm: make PR_SET_THP_DISABLE immediately active PR_SET_THP_DISABLE has a rather subtle semantic. It doesn't affect any existing mapping because it only updated mm->def_flags which is a template for new mappings. 
The mappings created after prctl(PR_SET_THP_DISABLE) have VM_NOHUGEPAGE flag set. This can be quite surprising for all those applications which do not do prctl(); fork() & exec() and want to control their own THP behavior. Another usecase when the immediate semantic of the prctl might be useful is a combination of pre- and post-copy migration of containers with CRIU. In this case CRIU populates a part of a memory region with data that was saved during the pre-copy stage. Afterwards, the region is registered with userfaultfd and CRIU expects to get page faults for the parts of the region that were not yet populated. However, khugepaged collapses the pages and the expected page faults do not occur. In more general case, the prctl(PR_SET_THP_DISABLE) could be used as a temporary mechanism for enabling/disabling THP process wide. Implementation wise, a new MMF_DISABLE_THP flag is added. This flag is tested when decision whether to use huge pages is taken either during page fault or at the time of THP collapse. It should be noted, that the new implementation makes PR_SET_THP_DISABLE master override to any per-VMA setting, which was not the case previously. Fixes: a0715cc22601 ("mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE") Link: http://lkml.kernel.org/r/1496415802-30944-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Michal Hocko Signed-off-by: Mike Rapoport Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: "Kirill A. 
Shutemov" Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 3 ++- include/linux/sched/coredump.h | 5 ++++- kernel/sys.c | 6 +++--- mm/khugepaged.c | 3 ++- mm/shmem.c | 8 +++++--- 6 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d3b3e8fcc717..40d7b7dd2653 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -92,6 +92,7 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); (1<vm_flags & VM_HUGEPAGE))) && \ !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ + !test_bit(MMF_DISABLE_THP, &(__vma)->vm_mm->flags) && \ !is_vma_temporary_stack(__vma)) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 5d9a400af509..f0d7335336cd 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -48,7 +48,8 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && - !(vm_flags & VM_NOHUGEPAGE)) + !(vm_flags & VM_NOHUGEPAGE) && + !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 69eedcef8f03..98ae0d05aa32 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -68,7 +68,10 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define 
MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 47d901586b4e..73fc0af147d0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2360,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) @@ -2368,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (down_write_killable(&me->mm->mmap_sem)) return -EINTR; if (arg2) - me->mm->def_flags |= VM_NOHUGEPAGE; + set_bit(MMF_DISABLE_THP, &me->mm->flags); else - me->mm->def_flags &= ~VM_NOHUGEPAGE; + clear_bit(MMF_DISABLE_THP, &me->mm->flags); up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index df4ebdb2b10a..c01f177a1120 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -816,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) static bool hugepage_vma_check(struct vm_area_struct *vma) { if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) + (vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_file(vma->vm_file)) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) diff --git a/mm/shmem.c b/mm/shmem.c index 9418f5a9bc46..b0aa6075d164 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf) } sgp = SGP_CACHE; - if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - else if (vma->vm_flags & VM_NOHUGEPAGE) + + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) sgp 
= SGP_NOHUGE; + else if (vma->vm_flags & VM_HUGEPAGE) + sgp = SGP_HUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); -- cgit v1.2.3-70-g09d2 From 7ab0e50ad0831e714dcdc3de44a7fe3887732b7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 10 Jul 2017 15:48:18 -0700 Subject: oom, trace: remove ENUM evaluation of COMPACTION_FEEDBACK After enabling CONFIG_TRACE_ENUM_MAP_FILE (which will soon be renamed to CONFIG_TRACE_EVAL_MAP_FILE), I am able to examine the enums that have been evaluated: # cat /sys/kernel/debug/tracing/enum_map (which will soon be renamed to eval_map) And it showed some interesting results: [..] ZONE_MOVABLE 3 (oom) ZONE_NORMAL 2 (oom) ZONE_DMA32 1 (oom) ZONE_DMA 0 (oom) 3 3 (oom) 2 2 (oom) 1 1 (oom) COMPACT_PRIO_ASYNC 2 (oom) COMPACT_PRIO_SYNC_LIGHT 1 (oom) COMPACT_PRIO_SYNC_FULL 0 (oom) [..] ZONE_DMA 0 (vmscan) 3 3 (vmscan) 2 2 (vmscan) 1 1 (vmscan) COMPACT_PRIO_ASYNC 2 (vmscan) [..] ZONE_DMA 0 (kmem) 3 3 (kmem) 2 2 (kmem) 1 1 (kmem) COMPACT_PRIO_ASYNC 2 (kmem) [..] ZONE_DMA 0 (compaction) 3 3 (compaction) 2 2 (compaction) 1 1 (compaction) COMPACT_PRIO_ASYNC 2 (compaction) [..] The name within the parenthesis are the trace systems that the enum/eval maps are associated with. When there's a number evaluated to another number, that tells me that the TRACE_DEFINE_ENUM() was used on a #define and not an enum. As #defines get converted normally, they are not needed to be evaluated. Each of the above trace systems with the number to number evaluation included the file include/trace/events/mmflags.h which has: /* High-level compaction status feedback */ #define COMPACTION_FAILED 1 #define COMPACTION_WITHDRAWN 2 #define COMPACTION_PROGRESS 3 [..] #define COMPACTION_FEEDBACK \ EM(COMPACTION_FAILED, "failed") \ EM(COMPACTION_WITHDRAWN, "withdrawn") \ EMe(COMPACTION_PROGRESS, "progress") Which is still needed for the __print_symbolic() usage in the trace_event. But it is not needed to be evaluated. 
Removing the evaluation part removes the unnecessary evaluations of numbers to numbers. Link: http://lkml.kernel.org/r/20170615074944.7be9a647@gandalf.local.home Signed-off-by: Steven Rostedt (VMware) Cc: Michal Hocko Cc: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/mmflags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 304ff94363b2..10e3663a75a6 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -257,7 +257,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ COMPACTION_STATUS COMPACTION_PRIORITY -COMPACTION_FEEDBACK +/* COMPACTION_FEEDBACK are defines not enums. Not needed here. */ ZONE_TYPE LRU_NAMES -- cgit v1.2.3-70-g09d2 From 16981d763501c0e06e434cf6b59f964c520e0ccc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 10 Jul 2017 15:48:22 -0700 Subject: mm: improve readability of transparent_hugepage_enabled() Turn the macro into a static inline and rewrite the condition checks for better readability in preparation for adding another condition. [ross.zwisler@linux.intel.com: fix logic to make conversion equivalent] [akpm@linux-foundation.org: resolve vs mm-make-pr_set_thp_disable-immediately-active.patch] [akpm@linux-foundation.org: include coredump.h for MMF_DISABLE_THP] Link: http://lkml.kernel.org/r/149739530612.20686.14760671150202647861.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Ross Zwisler Acked-by: "Kirill A. 
Shutemov" Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 40d7b7dd2653..f4239d3c9c73 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -1,6 +1,8 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H +#include + extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, @@ -85,15 +87,29 @@ extern struct kobj_attribute shmem_enabled_attr; extern bool is_vma_temporary_stack(struct vm_area_struct *vma); -#define transparent_hugepage_enabled(__vma) \ - ((transparent_hugepage_flags & \ - (1<vm_flags & VM_HUGEPAGE))) && \ - !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ - !test_bit(MMF_DISABLE_THP, &(__vma)->vm_mm->flags) && \ - !is_vma_temporary_stack(__vma)) +extern unsigned long transparent_hugepage_flags; + +static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_NOHUGEPAGE) + return false; + + if (is_vma_temporary_stack(vma)) + return false; + + if (test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) + return true; + + if (transparent_hugepage_flags & + (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) + return !!(vma->vm_flags & VM_HUGEPAGE); + + return false; +} + #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1< Date: Mon, 10 Jul 2017 15:48:25 -0700 Subject: mm: always enable thp for dax mappings The madvise policy for transparent huge pages is meant to avoid unwanted allocations of transparent huge pages. 
It allows a policy of disabling the extra memory pressure and effort to arrange for a huge page when it is not needed. DAX by definition never incurs this overhead since it is statically allocated. The policy choice makes even less sense for device-dax which tries to guarantee a given tlb-fault size. Specifically, the following setting: echo never > /sys/kernel/mm/transparent_hugepage/enabled ...violates that guarantee and silently disables all device-dax instances with a 2M or 1G alignment. So, let's avoid that non-obvious side effect by force enabling thp for dax mappings in all cases. It is worth noting that the reason this uses vma_is_dax(), and the resulting header include changes, is that previous attempts to add a VM_DAX flag were NAKd. Link: http://lkml.kernel.org/r/149739531127.20686.15813586620597484283.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Ross Zwisler Cc: Jan Kara Cc: Christoph Hellwig Cc: "Kirill A. Shutemov" Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 5 ----- include/linux/fs.h | 6 ++++++ include/linux/huge_mm.h | 5 +++++ 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 8f39db7439c3..794811875732 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -154,11 +154,6 @@ static inline unsigned int dax_radix_order(void *entry) #endif int dax_pfn_mkwrite(struct vm_fault *vmf); -static inline bool vma_is_dax(struct vm_area_struct *vma) -{ - return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); -} - static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); diff --git a/include/linux/fs.h b/include/linux/fs.h index 0cfa47125d52..78e1dbbe4cfd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -3127,6 +3128,11 @@ static 
inline bool io_is_direct(struct file *filp) return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host); } +static inline bool vma_is_dax(struct vm_area_struct *vma) +{ + return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); +} + static inline int iocb_flags(struct file *file) { int res = 0; diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f4239d3c9c73..ee696347f928 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -3,6 +3,8 @@ #include +#include /* only for vma_is_dax() */ + extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, @@ -103,6 +105,9 @@ static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma) if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; + if (vma_is_dax(vma)) + return true; + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) return !!(vma->vm_flags & VM_HUGEPAGE); -- cgit v1.2.3-70-g09d2 From 108a7ac448caff8e35e8c3f92f65faad893e5aca Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 10 Jul 2017 15:48:28 -0700 Subject: include/linux/page_ref.h: ensure page_ref_unfreeze is ordered against prior accesses page_ref_freeze and page_ref_unfreeze are designed to be used as a pair, wrapping a critical section where struct pages can be modified without having to worry about consistency for a concurrent fast-GUP. Whilst page_ref_freeze has full barrier semantics due to its use of atomic_cmpxchg, page_ref_unfreeze is implemented using atomic_set, which doesn't provide any barrier semantics and allows the operation to be reordered with respect to page modifications in the critical section. This patch ensures that page_ref_unfreeze is ordered after any critical section updates, by invoking smp_mb() prior to the atomic_set. 
Link: http://lkml.kernel.org/r/1497349722-6731-3-git-send-email-will.deacon@arm.com Signed-off-by: Will Deacon Acked-by: Steve Capper Acked-by: Kirill A. Shutemov Cc: Mark Rutland Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ref.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 610e13271918..1fd71733aa68 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -174,6 +174,7 @@ static inline void page_ref_unfreeze(struct page *page, int count) VM_BUG_ON_PAGE(page_count(page) != 0, page); VM_BUG_ON(count == 0); + smp_mb(); atomic_set(&page->_refcount, count); if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) __page_ref_unfreeze(page, count); -- cgit v1.2.3-70-g09d2 From 4db9b2efe94967be34e3b136a93251a3c1736dd5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:48:44 -0700 Subject: hugetlb, memory_hotplug: prefer to use reserved pages for migration new_node_page will try to use the origin's next NUMA node as the migration destination for hugetlb pages. If such a node doesn't have any preallocated pool it falls back to __alloc_buddy_huge_page_no_mpol to allocate a surplus page instead. This is quite suboptimal for any configuration when hugetlb pages are not distributed to all NUMA nodes evenly.
Say we have a hotpluggable node 4 and spare hugetlb pages are on node 0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:10000 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node2/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node3/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node4/hugepages/hugepages-2048kB/nr_hugepages:10000 /sys/devices/system/node/node5/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node6/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node7/hugepages/hugepages-2048kB/nr_hugepages:0 Now we consume the whole pool on node 4 and try to offline this node. All the allocated pages should be moved to node0 which has enough preallocated pages to hold them. With the current implementation offlining very likely fails because hugetlb allocations during runtime are much less reliable. Fix this by reusing the nodemask which excludes the migration source and trying to find the first node which has a page in the preallocated pool, falling back to __alloc_buddy_huge_page_no_mpol only when the whole pool is consumed.
[akpm@linux-foundation.org: remove bogus arg from alloc_huge_page_nodemask() stub] Link: http://lkml.kernel.org/r/20170608074553.22152-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Naoya Horiguchi Cc: Xishi Qiu Cc: zhong jiang Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 ++ mm/hugetlb.c | 27 +++++++++++++++++++++++++++ mm/memory_hotplug.c | 9 ++------- 3 files changed, 31 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 57f700ac127e..8fd0725d3f30 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -349,6 +349,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *alloc_huge_page_node(struct hstate *h, int nid); struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); +struct page *alloc_huge_page_nodemask(struct hstate *h, const nodemask_t *nmask); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -524,6 +525,7 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL +#define alloc_huge_page_nodemask(h, nmask) NULL #define alloc_huge_page_noerr(v, a, r) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 761a669d0b62..01c11ceb47d6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1723,6 +1723,33 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) return page; } +struct page *alloc_huge_page_nodemask(struct hstate *h, const nodemask_t *nmask) +{ + struct page *page = NULL; + int node; + + spin_lock(&hugetlb_lock); + if (h->free_huge_pages - h->resv_huge_pages > 0) { + for_each_node_mask(node, *nmask) { + page = dequeue_huge_page_node_exact(h, node); + if (page) + break; + } + 
} + spin_unlock(&hugetlb_lock); + if (page) + return page; + + /* No reservations, try to overcommit */ + for_each_node_mask(node, *nmask) { + page = __alloc_buddy_huge_page_no_mpol(h, node); + if (page) + return page; + } + + return NULL; +} + /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f42a8ef93ec4..1cf3404bd065 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1446,14 +1446,9 @@ static struct page *new_node_page(struct page *page, unsigned long private, if (nodes_empty(nmask)) node_set(nid, nmask); - /* - * TODO: allocate a destination hugepage from a nearest neighbor node, - * accordance with memory policy of the user process if possible. For - * now as a simple work-around, we use the next node for destination. - */ if (PageHuge(page)) - return alloc_huge_page_node(page_hstate(compound_head(page)), - next_node_in(nid, nmask)); + return alloc_huge_page_nodemask( + page_hstate(compound_head(page)), &nmask); if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) -- cgit v1.2.3-70-g09d2 From 8b9132388964df2cfe151a88fd1dd8219dabf23c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:48:47 -0700 Subject: mm: unify new_node_page and alloc_migrate_target Commit 394e31d2ceb4 ("mem-hotplug: alloc new page from a nearest neighbor node when mem-offline") has duplicated a large part of alloc_migrate_target with some hotplug specific special casing. To be more precise it tried to enfore the allocation from a different node than the original page. As a result the two function diverged in their shared logic, e.g. the hugetlb allocation strategy. Let's unify the two and express different NUMA requirements by the given nodemask. new_node_page will simply exclude the node it doesn't care about and alloc_migrate_target will use all the available nodes. 
alloc_migrate_target will then learn to migrate hugetlb pages more sanely and use preallocated pool when possible. Please note that alloc_migrate_target used to call alloc_page resp. alloc_pages_current so the memory policy of the current context which is quite strange when we consider that it is used in the context of alloc_contig_range which just tries to migrate pages which stand in the way. Link: http://lkml.kernel.org/r/20170608074553.22152-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Naoya Horiguchi Cc: Xishi Qiu Cc: zhong jiang Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 16 ++++++++++++++++ mm/memory_hotplug.c | 11 +---------- mm/page_isolation.c | 18 ++---------------- 3 files changed, 19 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 48e24844b3c5..d9675b665cc4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -4,6 +4,7 @@ #include #include #include +#include typedef struct page *new_page_t(struct page *page, unsigned long private, int **reason); @@ -30,6 +31,21 @@ enum migrate_reason { /* In mm/debug.c; also keep sync with include/trace/events/migrate.h */ extern char *migrate_reason_names[MR_TYPES]; +static inline struct page *new_page_nodemask(struct page *page, + int preferred_nid, nodemask_t *nodemask) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHuge(page)) + return alloc_huge_page_nodemask(page_hstate(compound_head(page)), + nodemask); + + if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) + gfp_mask |= __GFP_HIGHMEM; + + return __alloc_pages_nodemask(gfp_mask, 0, preferred_nid, nodemask); +} + #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1cf3404bd065..203c46306a74 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1433,7 
+1433,6 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) static struct page *new_node_page(struct page *page, unsigned long private, int **result) { - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; @@ -1446,15 +1445,7 @@ static struct page *new_node_page(struct page *page, unsigned long private, if (nodes_empty(nmask)) node_set(nid, nmask); - if (PageHuge(page)) - return alloc_huge_page_nodemask( - page_hstate(compound_head(page)), &nmask); - - if (PageHighMem(page) - || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) - gfp_mask |= __GFP_HIGHMEM; - - return __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask); + return new_page_nodemask(page, nid, &nmask); } #define NR_OFFLINE_AT_ONCE_PAGES (256) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3606104893e0..757410d9f758 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -294,20 +295,5 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, struct page *alloc_migrate_target(struct page *page, unsigned long private, int **resultp) { - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; - - /* - * TODO: allocate a destination hugepage from a nearest neighbor node, - * accordance with memory policy of the user process if possible. For - * now as a simple work-around, we use the next node for destination. 
- */ - if (PageHuge(page)) - return alloc_huge_page_node(page_hstate(compound_head(page)), - next_node_in(page_to_nid(page), - node_online_map)); - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - - return alloc_page(gfp_mask); + return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); } -- cgit v1.2.3-70-g09d2 From 422580c3cea7faaca67f6199375b79565d3d8ebd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 Jul 2017 15:49:05 -0700 Subject: mm/oom_kill.c: add tracepoints for oom reaper-related events During the debugging of the problem described in https://lkml.org/lkml/2017/5/17/542 and fixed by Tetsuo Handa in https://lkml.org/lkml/2017/5/19/383 , I've found that the existing debug output is not really useful to understand issues related to the oom reaper. So, I assume, that adding some tracepoints might help with debugging of similar issues. Trace the following events: 1) a process is marked as an oom victim, 2) a process is added to the oom reaper list, 3) the oom reaper starts reaping process's mm, 4) the oom reaper finished reaping, 5) the oom reaper skips reaping. How it works in practice? Below is an example which show how the problem mentioned above can be found: one process is added twice to the oom_reaper list: $ cd /sys/kernel/debug/tracing $ echo "oom:mark_victim" > set_event $ echo "oom:wake_reaper" >> set_event $ echo "oom:skip_task_reaping" >> set_event $ echo "oom:start_task_reaping" >> set_event $ echo "oom:finish_task_reaping" >> set_event $ cat trace_pipe allocate-502 [001] .... 91.836405: mark_victim: pid=502 allocate-502 [001] .N.. 91.837356: wake_reaper: pid=502 allocate-502 [000] .N.. 91.871149: wake_reaper: pid=502 oom_reaper-23 [000] .... 91.871177: start_task_reaping: pid=502 oom_reaper-23 [000] .N.. 91.879511: finish_task_reaping: pid=502 oom_reaper-23 [000] .... 
91.879580: skip_task_reaping: pid=502 Link: http://lkml.kernel.org/r/20170530185231.GA13412@castle Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/oom.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 7 ++++ 2 files changed, 87 insertions(+) (limited to 'include') diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 38baeb27221a..c3c19d47ae5e 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -70,6 +70,86 @@ TRACE_EVENT(reclaim_retry_zone, __entry->wmark_check) ); +TRACE_EVENT(mark_victim, + TP_PROTO(int pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __field(int, pid) + ), + + TP_fast_assign( + __entry->pid = pid; + ), + + TP_printk("pid=%d", __entry->pid) +); + +TRACE_EVENT(wake_reaper, + TP_PROTO(int pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __field(int, pid) + ), + + TP_fast_assign( + __entry->pid = pid; + ), + + TP_printk("pid=%d", __entry->pid) +); + +TRACE_EVENT(start_task_reaping, + TP_PROTO(int pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __field(int, pid) + ), + + TP_fast_assign( + __entry->pid = pid; + ), + + TP_printk("pid=%d", __entry->pid) +); + +TRACE_EVENT(finish_task_reaping, + TP_PROTO(int pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __field(int, pid) + ), + + TP_fast_assign( + __entry->pid = pid; + ), + + TP_printk("pid=%d", __entry->pid) +); + +TRACE_EVENT(skip_task_reaping, + TP_PROTO(int pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __field(int, pid) + ), + + TP_fast_assign( + __entry->pid = pid; + ), + + TP_printk("pid=%d", __entry->pid) +); + #ifdef CONFIG_COMPACTION TRACE_EVENT(compact_retry, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0e2c925e7826..9e8b4f030c1c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -490,6 +490,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) if 
(!down_read_trylock(&mm->mmap_sem)) { ret = false; + trace_skip_task_reaping(tsk->pid); goto unlock_oom; } @@ -500,9 +501,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ if (!mmget_not_zero(mm)) { up_read(&mm->mmap_sem); + trace_skip_task_reaping(tsk->pid); goto unlock_oom; } + trace_start_task_reaping(tsk->pid); + /* * Tell all users of get_user/copy_from_user etc... that the content * is no longer stable. No barriers really needed because unmapping @@ -544,6 +548,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * put the oom_reaper out of the way. */ mmput_async(mm); + trace_finish_task_reaping(tsk->pid); unlock_oom: mutex_unlock(&oom_lock); return ret; @@ -615,6 +620,7 @@ static void wake_oom_reaper(struct task_struct *tsk) tsk->oom_reaper_list = oom_reaper_list; oom_reaper_list = tsk; spin_unlock(&oom_reaper_lock); + trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait); } @@ -666,6 +672,7 @@ static void mark_oom_victim(struct task_struct *tsk) */ __thaw_task(tsk); atomic_inc(&oom_victims); + trace_mark_victim(tsk->pid); } /** -- cgit v1.2.3-70-g09d2 From aaf14e40a33a2c9350471387031ca40c00f5a006 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:49:08 -0700 Subject: mm, hugetlb: unclutter hugetlb allocation layers Patch series "mm, hugetlb: allow proper node fallback dequeue". While working on a hugetlb migration issue addressed in a separate patchset[1] I have noticed that the hugetlb allocations from the preallocated pool are quite subotimal. [1] //lkml.kernel.org/r/20170608074553.22152-1-mhocko@kernel.org There is no fallback mechanism implemented and no notion of preferred node. I have tried to work around it but Vlastimil was right to push back for a more robust solution. It seems that such a solution is to reuse zonelist approach we use for the page alloctor. This series has 3 patches. The first one tries to make hugetlb allocation layers more clear. 
The second one implements the zonelist hugetlb pool allocation and introduces a preferred node semantic which is used by the migration callbacks. The last patch is a clean up. This patch (of 3): Hugetlb allocation path for fresh huge pages is unnecessarily complex and it mixes different interfaces between layers. __alloc_buddy_huge_page is the central place to perform a new allocation. It checks for the hugetlb overcommit and then relies on __hugetlb_alloc_buddy_huge_page to invoke the page allocator. This is all good except that __alloc_buddy_huge_page pushes vma and address down the callchain and so __hugetlb_alloc_buddy_huge_page has to deal with two different allocation modes - one for memory policy and the other for node specific (or to make it more obscure node non-specific) requests. This just screams for a reorganization. This patch pulls out all the vma specific handling up to __alloc_buddy_huge_page_with_mpol where it belongs. __alloc_buddy_huge_page will get a nodemask argument and __hugetlb_alloc_buddy_huge_page will become a trivial wrapper over the page allocator. In short: __alloc_buddy_huge_page_with_mpol - memory policy handling __alloc_buddy_huge_page - overcommit handling and accounting __hugetlb_alloc_buddy_huge_page - page allocator layer Also note that __hugetlb_alloc_buddy_huge_page and its cpuset retry loop is not really needed because the page allocator already handles the cpusets update. Finally __hugetlb_alloc_buddy_huge_page had a special case for node specific allocations (when no policy is applied and there is a node given). This has relied on __GFP_THISNODE to not fall back to a different node. alloc_huge_page_node is the only caller which relies on this behavior so move the __GFP_THISNODE there. Not only does this remove quite some code it also should make those layers easier to follow and clearer wrt responsibilities.
Link: http://lkml.kernel.org/r/20170622193034.28972-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Mike Kravetz Tested-by: Mike Kravetz Cc: Naoya Horiguchi Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 133 +++++++++++------------------------------------- 2 files changed, 30 insertions(+), 105 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8fd0725d3f30..66b621469f52 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -349,7 +349,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *alloc_huge_page_node(struct hstate *h, int nid); struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); -struct page *alloc_huge_page_nodemask(struct hstate *h, const nodemask_t *nmask); +struct page *alloc_huge_page_nodemask(struct hstate *h, nodemask_t *nmask); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 907786581812..fd6e0c50f949 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1521,82 +1521,19 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) return rc; } -/* - * There are 3 ways this can get called: - * 1. With vma+addr: we use the VMA's memory policy - * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge - * page from any node, and let the buddy allocator itself figure - * it out. - * 3. With !vma, but nid!=NUMA_NO_NODE. 
We allocate a huge page - * strictly from 'nid' - */ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr, int nid) + gfp_t gfp_mask, int nid, nodemask_t *nmask) { int order = huge_page_order(h); - gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; - unsigned int cpuset_mems_cookie; - /* - * We need a VMA to get a memory policy. If we do not - * have one, we use the 'nid' argument. - * - * The mempolicy stuff below has some non-inlined bits - * and calls ->vm_ops. That makes it hard to optimize at - * compile-time, even when NUMA is off and it does - * nothing. This helps the compiler optimize it out. - */ - if (!IS_ENABLED(CONFIG_NUMA) || !vma) { - /* - * If a specific node is requested, make sure to - * get memory from there, but only when a node - * is explicitly specified. - */ - if (nid != NUMA_NO_NODE) - gfp |= __GFP_THISNODE; - /* - * Make sure to call something that can handle - * nid=NUMA_NO_NODE - */ - return alloc_pages_node(nid, gfp, order); - } - - /* - * OK, so we have a VMA. Fetch the mempolicy and try to - * allocate a huge page with it. We will only reach this - * when CONFIG_NUMA=y. - */ - do { - struct page *page; - struct mempolicy *mpol; - int nid; - nodemask_t *nodemask; - - cpuset_mems_cookie = read_mems_allowed_begin(); - nid = huge_node(vma, addr, gfp, &mpol, &nodemask); - mpol_cond_put(mpol); - page = __alloc_pages_nodemask(gfp, order, nid, nodemask); - if (page) - return page; - } while (read_mems_allowed_retry(cpuset_mems_cookie)); - - return NULL; + gfp_mask |= __GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + return __alloc_pages_nodemask(gfp_mask, order, nid, nmask); } -/* - * There are two ways to allocate a huge page: - * 1. When you have a VMA and an address (like a fault) - * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) - * - * 'vma' and 'addr' are only for (1). 
'nid' is always NUMA_NO_NODE in - * this case which signifies that the allocation should be done with - * respect for the VMA's memory policy. - * - * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This - * implies that memory policies will not be taken in to account. - */ -static struct page *__alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr, int nid) +static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { struct page *page; unsigned int r_nid; @@ -1604,15 +1541,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, if (hstate_is_gigantic(h)) return NULL; - /* - * Make sure that anyone specifying 'nid' is not also specifying a VMA. - * This makes sure the caller is picking _one_ of the modes with which - * we can call this function, not both. - */ - if (vma || (addr != -1)) { - VM_WARN_ON_ONCE(addr == -1); - VM_WARN_ON_ONCE(nid != NUMA_NO_NODE); - } /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed @@ -1646,7 +1574,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, } spin_unlock(&hugetlb_lock); - page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); spin_lock(&hugetlb_lock); if (page) { @@ -1670,19 +1598,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, return page; } -/* - * Allocate a huge page from 'nid'. Note, 'nid' may be - * NUMA_NO_NODE, which means that it may be allocated - * anywhere. - */ -static -struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) -{ - unsigned long addr = -1; - - return __alloc_buddy_huge_page(h, NULL, addr, nid); -} - /* * Use the VMA's mpolicy to allocate a huge page from the buddy. 
*/ @@ -1690,7 +1605,17 @@ static struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); + struct page *page; + struct mempolicy *mpol; + gfp_t gfp_mask = htlb_alloc_mask(h); + int nid; + nodemask_t *nodemask; + + nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); + page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask); + mpol_cond_put(mpol); + + return page; } /* @@ -1700,21 +1625,26 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { + gfp_t gfp_mask = htlb_alloc_mask(h); struct page *page = NULL; + if (nid != NUMA_NO_NODE) + gfp_mask |= __GFP_THISNODE; + spin_lock(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); if (!page) - page = __alloc_buddy_huge_page_no_mpol(h, nid); + page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL); return page; } -struct page *alloc_huge_page_nodemask(struct hstate *h, const nodemask_t *nmask) +struct page *alloc_huge_page_nodemask(struct hstate *h, nodemask_t *nmask) { + gfp_t gfp_mask = htlb_alloc_mask(h); struct page *page = NULL; int node; @@ -1731,13 +1661,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, const nodemask_t *nmask) return page; /* No reservations, try to overcommit */ - for_each_node_mask(node, *nmask) { - page = __alloc_buddy_huge_page_no_mpol(h, node); - if (page) - return page; - } - - return NULL; + return __alloc_buddy_huge_page(h, gfp_mask, NUMA_NO_NODE, nmask); } /* @@ -1765,7 +1689,8 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h), + NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; break; -- cgit 
v1.2.3-70-g09d2 From 3e59fcb0e8c1c40aecb60fa4c2d1543d6a097184 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:49:11 -0700 Subject: hugetlb: add support for preferred node to alloc_huge_page_nodemask alloc_huge_page_nodemask tries to allocate from any NUMA node in the allowed node mask starting from lower NUMA nodes. This might lead to filling up those low NUMA nodes while others are not used. We can reduce this risk by introducing a concept of the preferred node similar to what we have in the regular page allocator. We will start allocating from the preferred nid and then iterate over all allowed nodes in the zonelist order until we try them all. This is mimicking the page allocator logic except it operates on per-node mempools. dequeue_huge_page_vma already does this so distill the zonelist logic into a more generic dequeue_huge_page_nodemask and use it in alloc_huge_page_nodemask. This will allow us to use proper per NUMA distance fallback also for alloc_huge_page_node which can use alloc_huge_page_nodemask now and we can get rid of alloc_huge_page_node helper which doesn't have any user anymore.
Link: http://lkml.kernel.org/r/20170622193034.28972-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Mike Kravetz Tested-by: Mike Kravetz Cc: Naoya Horiguchi Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +-- include/linux/migrate.h | 2 +- mm/hugetlb.c | 88 ++++++++++++++++++++++++------------------------- 3 files changed, 48 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 66b621469f52..8d9fe131a240 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -349,7 +349,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *alloc_huge_page_node(struct hstate *h, int nid); struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); -struct page *alloc_huge_page_nodemask(struct hstate *h, nodemask_t *nmask); +struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, + nodemask_t *nmask); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -525,7 +526,7 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL -#define alloc_huge_page_nodemask(h, nmask) NULL +#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL #define alloc_huge_page_noerr(v, a, r) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL diff --git a/include/linux/migrate.h b/include/linux/migrate.h index d9675b665cc4..4634da521238 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -38,7 +38,7 @@ static inline struct page *new_page_nodemask(struct page *page, if (PageHuge(page)) return alloc_huge_page_nodemask(page_hstate(compound_head(page)), - nodemask); + preferred_nid, nodemask); if (PageHighMem(page) || (zone_idx(page_zone(page)) 
== ZONE_MOVABLE)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fd6e0c50f949..1e516520433d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -887,19 +887,39 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) return page; } -static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, + nodemask_t *nmask) { - struct page *page; - int node; + unsigned int cpuset_mems_cookie; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + int node = -1; - if (nid != NUMA_NO_NODE) - return dequeue_huge_page_node_exact(h, nid); + zonelist = node_zonelist(nid, gfp_mask); + +retry_cpuset: + cpuset_mems_cookie = read_mems_allowed_begin(); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { + struct page *page; + + if (!cpuset_zone_allowed(zone, gfp_mask)) + continue; + /* + * no need to ask again on the same node. 
Pool is node rather than + * zone aware + */ + if (zone_to_nid(zone) == node) + continue; + node = zone_to_nid(zone); - for_each_online_node(node) { page = dequeue_huge_page_node_exact(h, node); if (page) return page; } + if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return NULL; } @@ -917,15 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page = NULL; + struct page *page; struct mempolicy *mpol; - nodemask_t *nodemask; gfp_t gfp_mask; + nodemask_t *nodemask; int nid; - struct zonelist *zonelist; - struct zone *zone; - struct zoneref *z; - unsigned int cpuset_mems_cookie; /* * A child process with MAP_PRIVATE mappings created by their parent @@ -940,32 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) goto err; -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - zonelist = node_zonelist(nid, gfp_mask); - - for_each_zone_zonelist_nodemask(zone, z, zonelist, - MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed(zone, gfp_mask)) { - page = dequeue_huge_page_node(h, zone_to_nid(zone)); - if (page) { - if (avoid_reserve) - break; - if (!vma_has_reserves(vma, chg)) - break; - - SetPagePrivate(page); - h->resv_huge_pages--; - break; - } - } + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { + SetPagePrivate(page); + h->resv_huge_pages--; } mpol_cond_put(mpol); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; return page; err: @@ -1633,7 +1632,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_lock(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) - page = dequeue_huge_page_node(h, nid); + page = 
dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL); spin_unlock(&hugetlb_lock); if (!page) @@ -1642,26 +1641,27 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) return page; } -struct page *alloc_huge_page_nodemask(struct hstate *h, nodemask_t *nmask) + +struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, + nodemask_t *nmask) { gfp_t gfp_mask = htlb_alloc_mask(h); - struct page *page = NULL; - int node; spin_lock(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) { - for_each_node_mask(node, *nmask) { - page = dequeue_huge_page_node_exact(h, node); - if (page) - break; + struct page *page; + + page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); + if (page) { + spin_unlock(&hugetlb_lock); + return page; } } spin_unlock(&hugetlb_lock); - if (page) - return page; /* No reservations, try to overcommit */ - return __alloc_buddy_huge_page(h, gfp_mask, NUMA_NO_NODE, nmask); + + return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask); } /* -- cgit v1.2.3-70-g09d2 From 618b8c20d03c9ea06711bd36d906322ba35c0add Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 10 Jul 2017 15:49:32 -0700 Subject: include/linux/mmzone.h: remove ancient/ambiguous comment Currently pg_data_t is just a struct which describes a NUMA node memory layout. Let's keep the comment simple and remove ambiguity. 
Link: http://lkml.kernel.org/r/1498220534-22717-1-git-send-email-nborisov@suse.com Signed-off-by: Nikolay Borisov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e8f100cb56d..16532fa0bb64 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -603,12 +603,9 @@ extern struct page *mem_map; #endif /* - * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM - * (mostly NUMA machines?) to denote a higher-level memory zone than the - * zone denotes. - * * On NUMA machines, each NUMA node would have a pg_data_t to describe - * it's memory layout. + * it's memory layout. On UMA machines there is a single pglist_data which + * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. -- cgit v1.2.3-70-g09d2 From e3d3910a57ab9c70cddb2522ae711ff9bff89e7c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 10 Jul 2017 15:49:35 -0700 Subject: include/linux/backing-dev.h: simplify wb_stat_sum wb_stat_sum() disables interrupts and calls __wb_stat_sum() which eventually calls __percpu_counter_sum(). However, the percpu routine is already irq-safe. Simplify the code a bit by making wb_stat_sum() directly call percpu_counter_sum_positive() and not disable interrupts. Also remove the now-unneeded __wb_stat_sum() which was just a wrapper over percpu_counter_sum_positive().
Link: http://lkml.kernel.org/r/1498230681-29103-1-git-send-email-nborisov@suse.com Signed-off-by: Nikolay Borisov Acked-by: Peter Zijlstra Cc: Tejun Heo Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ace73f96eb1e..334165c911f0 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -104,22 +104,9 @@ static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) return percpu_counter_read_positive(&wb->stat[item]); } -static inline s64 __wb_stat_sum(struct bdi_writeback *wb, - enum wb_stat_item item) -{ - return percpu_counter_sum_positive(&wb->stat[item]); -} - static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { - s64 sum; - unsigned long flags; - - local_irq_save(flags); - sum = __wb_stat_sum(wb, item); - local_irq_restore(flags); - - return sum; + return percpu_counter_sum_positive(&wb->stat[item]); } extern void wb_writeout_inc(struct bdi_writeback *wb); -- cgit v1.2.3-70-g09d2 From 2c80cd57c74339889a8752b20862a16c28929c3a Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Mon, 10 Jul 2017 15:49:57 -0700 Subject: mm/list_lru.c: fix list_lru_count_node() to be race free list_lru_count_node() iterates over all memcgs to get the total number of entries on the node but it can race with memcg_drain_all_list_lrus(), which migrates the entries from a dead cgroup to another. This can return incorrect number of entries from list_lru_count_node(). Fix this by keeping track of entries per node and simply return it in list_lru_count_node(). 
Link: http://lkml.kernel.org/r/1498707555-30525-1-git-send-email-stummala@codeaurora.org Signed-off-by: Sahitya Tummala Acked-by: Vladimir Davydov Cc: Jan Kara Cc: Alexander Polakov Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 1 + mm/list_lru.c | 14 ++++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index cb0ba9f2a9a2..fa7fd03cb5f9 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -44,6 +44,7 @@ struct list_lru_node { /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ struct list_lru_memcg *memcg_lrus; #endif + long nr_items; } ____cacheline_aligned_in_smp; struct list_lru { diff --git a/mm/list_lru.c b/mm/list_lru.c index 234676e31edd..7a40fa2be858 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -117,6 +117,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) l = list_lru_from_kmem(nlru, item); list_add_tail(item, &l->list); l->nr_items++; + nlru->nr_items++; spin_unlock(&nlru->lock); return true; } @@ -136,6 +137,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) l = list_lru_from_kmem(nlru, item); list_del_init(item); l->nr_items--; + nlru->nr_items--; spin_unlock(&nlru->lock); return true; } @@ -183,15 +185,10 @@ EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { - long count = 0; - int memcg_idx; + struct list_lru_node *nlru; - count += __list_lru_count_one(lru, nid, -1); - if (list_lru_memcg_aware(lru)) { - for_each_memcg_cache_index(memcg_idx) - count += __list_lru_count_one(lru, nid, memcg_idx); - } - return count; + nlru = &lru->node[nid]; + return nlru->nr_items; } EXPORT_SYMBOL_GPL(list_lru_count_node); @@ -226,6 +223,7 @@ restart: assert_spin_locked(&nlru->lock); case LRU_REMOVED: isolated++; + nlru->nr_items--; /* * If the lru lock has been dropped, our list * 
traversal is now invalid and so we have to -- cgit v1.2.3-70-g09d2 From a47fed5b5b014f5a13878b90ef2c3a7dc294189f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jul 2017 15:50:06 -0700 Subject: mm: swap: provide lru_add_drain_all_cpuslocked() The rework of the cpu hotplug locking unearthed potential deadlocks with the memory hotplug locking code. The solution for these is to rework the memory hotplug locking code as well and take the cpu hotplug lock before the memory hotplug lock in mem_hotplug_begin(), but this will cause a recursive locking of the cpu hotplug lock when the memory hotplug code calls lru_add_drain_all(). Split out the inner workings of lru_add_drain_all() into lru_add_drain_all_cpuslocked() so this function can be invoked from the memory hotplug code with the cpu hotplug lock held. Link: http://lkml.kernel.org/r/20170704093421.419329357@linutronix.de Signed-off-by: Thomas Gleixner Reported-by: Andrey Ryabinin Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Vladimir Davydov Cc: Peter Zijlstra Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/swap.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 61e7180cee21..d83d28e53e62 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -277,6 +277,7 @@ extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); +extern void lru_add_drain_all_cpuslocked(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); extern void mark_page_lazyfree(struct page *page); diff --git a/mm/swap.c b/mm/swap.c index 4f44dbd7f780..60b1d2a75852 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -688,7 +688,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static 
DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -void lru_add_drain_all(void) +void lru_add_drain_all_cpuslocked(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; @@ -702,7 +702,6 @@ void lru_add_drain_all(void) return; mutex_lock(&lock); - get_online_cpus(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { @@ -722,10 +721,16 @@ void lru_add_drain_all(void) for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); - put_online_cpus(); mutex_unlock(&lock); } +void lru_add_drain_all(void) +{ + get_online_cpus(); + lru_add_drain_all_cpuslocked(); + put_online_cpus(); +} + /** * release_pages - batched put_page() * @pages: array of pages to release -- cgit v1.2.3-70-g09d2 From 9d1f4b3f5b29bea431525e528a3ff2dc806ad904 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:50:12 -0700 Subject: mm: disallow early_pfn_to_nid on configurations which do not implement it early_pfn_to_nid will return node 0 if both HAVE_ARCH_EARLY_PFN_TO_NID and HAVE_MEMBLOCK_NODE_MAP are disabled. It seems we are safe now because all architectures which support NUMA define one of them (with an exception of alpha which however has CONFIG_NUMA marked as broken) so this works as expected. It can get silently and subtly broken too easily, though. Make sure we fail the compilation if NUMA is enabled and there is no proper implementation for this function. If that ever happens we know that either the specific configuration is invalid and the fix should either disable NUMA or enable one of the above configs. 
Link: http://lkml.kernel.org/r/20170704075803.15979-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Yang Shi Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 16532fa0bb64..fc14b8b3f6ce 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1055,6 +1055,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) static inline unsigned long early_pfn_to_nid(unsigned long pfn) { + BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA)); return 0; } #endif -- cgit v1.2.3-70-g09d2 From 0b396923ee9bdcb4a208df2148712b79b6dee73e Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 10 Jul 2017 15:50:55 -0700 Subject: asm-generic/bug.h: declare struct pt_regs; before function prototype This series of patches splits BUILD_BUG related macros out of "include/linux/bug.h" into new file "include/linux/build_bug.h" (patch 5), and changes the pointer type checking in the `container_of()` macro to deal with pointers of array type better (patch 6). Patches 1 to 4 are prerequisites. Patches 2, 3, 4, and 5 have been inserted since the previous version of this patch series. Patch 6 here corresponds to v3 and v4's patch 2. Patch 1 was a prerequisite in v3 of this series to avoid a lot of warnings when <linux/bug.h> was included by <linux/kernel.h>. That is no longer relevant for v5 of the series, but I left it in because it was acked by Arnd Bergmann and Michal Nazarewicz. Patches 2, 3, and 4 are some checkpatch clean-ups on "include/linux/bug.h" before splitting out the BUILD_BUG stuff in patch 5. Patch 5 splits the BUILD_BUG related macros out of "include/linux/bug.h" into new file "include/linux/build_bug.h" because including <linux/bug.h> in "include/linux/kernel.h" would result in build failures due to circular dependencies.
Patch 6 changes the pointer type checking by `container_of()` to avoid some incompatible pointer warnings when the dereferenced pointer has array type. 1) asm-generic/bug.h: declare struct pt_regs; before function prototype 2) linux/bug.h: correct formatting of block comment 3) linux/bug.h: correct "(foo*)" should be "(foo *)" 4) linux/bug.h: correct "space required before that '-'" 5) bug: split BUILD_BUG stuff out into <linux/build_bug.h> 6) kernel.h: handle pointers to arrays better in container_of() This patch (of 6): The declaration of `__warn()` has `struct pt_regs *regs` as one of its parameters. This can result in compiler warnings if `struct pt_regs` is not already declared. Add an empty declaration of `struct pt_regs` to avoid the warnings. Link: http://lkml.kernel.org/r/20170525120316.24473-2-abbotti@mev.co.uk Signed-off-by: Ian Abbott Acked-by: Arnd Bergmann Acked-by: Michal Nazarewicz Cc: Arnd Bergmann Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index d6f4aed479a1..87191357d303 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -97,6 +97,7 @@ extern void warn_slowpath_null(const char *file, const int line); /* used internally by panic.c */ struct warn_args; +struct pt_regs; void __warn(const char *file, int line, void *caller, unsigned taint, struct pt_regs *regs, struct warn_args *args); -- cgit v1.2.3-70-g09d2 From e9d5a48499391fe5b0615610858665ba8149e255 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 10 Jul 2017 15:50:58 -0700 Subject: linux/bug.h: correct formatting of block comment Correct these checkpatch.pl warnings: |WARNING: Block comments use * on subsequent lines |#34: FILE: include/linux/bug.h:34: |+/* Force a compilation error if condition is true, but also produce a |+ result (of value 0 and type size_t), so the expression can be used |WARNING: Block
comments use a trailing */ on a separate line |#36: FILE: include/linux/bug.h:36: |+ aren't permitted). */ Link: http://lkml.kernel.org/r/20170525120316.24473-3-abbotti@mev.co.uk Signed-off-by: Ian Abbott Acked-by: Michal Nazarewicz Cc: Kees Cook Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Jakub Kicinski Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index 687b557fc5eb..ca24007e2dc3 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -30,10 +30,12 @@ struct pt_regs; #define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) -/* Force a compilation error if condition is true, but also produce a - result (of value 0 and type size_t), so the expression can be used - e.g. in a structure initializer (or where-ever else comma expressions - aren't permitted). */ +/* + * Force a compilation error if condition is true, but also produce a + * result (of value 0 and type size_t), so the expression can be used + * e.g. in a structure initializer (or where-ever else comma expressions + * aren't permitted). 
+ */ #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) #define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) -- cgit v1.2.3-70-g09d2 From 8cdd7cca9287abf4c849c01e2a4e8207ad3e3a82 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 10 Jul 2017 15:51:01 -0700 Subject: linux/bug.h: correct "(foo*)" should be "(foo *)" Correct this checkpatch.pl error: |ERROR: "(foo*)" should be "(foo *)" |#19: FILE: include/linux/bug.h:19: |+#define BUILD_BUG_ON_NULL(e) ((void*)0) Link: http://lkml.kernel.org/r/20170525120316.24473-4-abbotti@mev.co.uk Signed-off-by: Ian Abbott Acked-by: Michal Nazarewicz Cc: Kees Cook Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Jakub Kicinski Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index ca24007e2dc3..216a1b79653d 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -16,7 +16,7 @@ struct pt_regs; #define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_ZERO(e) (0) -#define BUILD_BUG_ON_NULL(e) ((void*)0) +#define BUILD_BUG_ON_NULL(e) ((void *)0) #define BUILD_BUG_ON_INVALID(e) (0) #define BUILD_BUG_ON_MSG(cond, msg) (0) #define BUILD_BUG_ON(condition) (0) -- cgit v1.2.3-70-g09d2 From 47e81e59d98b90727a02ceb486407eeed5eb8727 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 10 Jul 2017 15:51:04 -0700 Subject: linux/bug.h: correct "space required before that '-'" Correct these checkpatch.pl errors: |ERROR: space required before that '-' (ctx:OxO) |#37: FILE: include/linux/bug.h:37: |+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) |ERROR: space required before that '-' (ctx:OxO) |#38: FILE: include/linux/bug.h:38: |+#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) I decided to wrap the bitfield expressions that begin with minus signs in 
parentheses rather than insert spaces before the minus signs. Link: http://lkml.kernel.org/r/20170525120316.24473-5-abbotti@mev.co.uk Signed-off-by: Ian Abbott Acked-by: Michal Nazarewicz Cc: Kees Cook Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Jakub Kicinski Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index 216a1b79653d..483207cb99fb 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -36,8 +36,8 @@ struct pt_regs; * e.g. in a structure initializer (or where-ever else comma expressions * aren't permitted). */ -#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) -#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); })) +#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:(-!!(e)); })) /* * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the -- cgit v1.2.3-70-g09d2 From bc6245e5efd70c41eaf9334b1b5e646745cb0fb3 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 10 Jul 2017 15:51:07 -0700 Subject: bug: split BUILD_BUG stuff out into <linux/build_bug.h> Including <linux/bug.h> pulls in a lot of bloat from <asm/bug.h> and <asm-generic/bug.h> that is not needed to call the BUILD_BUG() family of macros. Split them out into their own header, <linux/build_bug.h>. Also correct some checkpatch.pl errors for the BUILD_BUG_ON_ZERO() and BUILD_BUG_ON_NULL() macros by adding parentheses around the bitfield widths that begin with a minus sign.
Link: http://lkml.kernel.org/r/20170525120316.24473-6-abbotti@mev.co.uk Signed-off-by: Ian Abbott Acked-by: Michal Nazarewicz Acked-by: Kees Cook Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Jakub Kicinski Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 74 +---------------------------------------- include/linux/build_bug.h | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 73 deletions(-) create mode 100644 include/linux/build_bug.h (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index 483207cb99fb..5d5554c874fd 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -3,6 +3,7 @@ #include #include +#include enum bug_trap_type { BUG_TRAP_TYPE_NONE = 0, @@ -13,82 +14,9 @@ enum bug_trap_type { struct pt_regs; #ifdef __CHECKER__ -#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) -#define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) -#define BUILD_BUG_ON_ZERO(e) (0) -#define BUILD_BUG_ON_NULL(e) ((void *)0) -#define BUILD_BUG_ON_INVALID(e) (0) -#define BUILD_BUG_ON_MSG(cond, msg) (0) -#define BUILD_BUG_ON(condition) (0) -#define BUILD_BUG() (0) #define MAYBE_BUILD_BUG_ON(cond) (0) #else /* __CHECKER__ */ -/* Force a compilation error if a constant expression is not a power of 2 */ -#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ - BUILD_BUG_ON(((n) & ((n) - 1)) != 0) -#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ - BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) - -/* - * Force a compilation error if condition is true, but also produce a - * result (of value 0 and type size_t), so the expression can be used - * e.g. in a structure initializer (or where-ever else comma expressions - * aren't permitted). 
- */ -#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); })) -#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:(-!!(e)); })) - -/* - * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the - * expression but avoids the generation of any code, even if that expression - * has side-effects. - */ -#define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e)))) - -/** - * BUILD_BUG_ON_MSG - break compile if a condition is true & emit supplied - * error message. - * @condition: the condition which the compiler should know is false. - * - * See BUILD_BUG_ON for description. - */ -#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) - -/** - * BUILD_BUG_ON - break compile if a condition is true. - * @condition: the condition which the compiler should know is false. - * - * If you have some code which relies on certain constants being equal, or - * some other compile-time-evaluated condition, you should use BUILD_BUG_ON to - * detect if someone changes it. - * - * The implementation uses gcc's reluctance to create a negative array, but gcc - * (as of 4.4) only emits that error for obvious cases (e.g. not arguments to - * inline functions). Luckily, in 4.3 they added the "error" function - * attribute just for this type of case. Thus, we use a negative sized array - * (should always create an error on gcc versions older than 4.4) and then call - * an undefined function with the error attribute (should always create an - * error on gcc 4.3 and later). If for some reason, neither creates a - * compile-time error, we'll still have a link-time error, which is harder to - * track down. - */ -#ifndef __OPTIMIZE__ -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) -#else -#define BUILD_BUG_ON(condition) \ - BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition) -#endif - -/** - * BUILD_BUG - break compile if used. 
- * - * If you have some code that you expect the compiler to eliminate at - * build time, you should use BUILD_BUG to detect if it is - * unexpectedly used. - */ -#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed") - #define MAYBE_BUILD_BUG_ON(cond) \ do { \ if (__builtin_constant_p((cond))) \ diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h new file mode 100644 index 000000000000..b7d22d60008a --- /dev/null +++ b/include/linux/build_bug.h @@ -0,0 +1,84 @@ +#ifndef _LINUX_BUILD_BUG_H +#define _LINUX_BUILD_BUG_H + +#include + +#ifdef __CHECKER__ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) +#define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) +#define BUILD_BUG_ON_ZERO(e) (0) +#define BUILD_BUG_ON_NULL(e) ((void *)0) +#define BUILD_BUG_ON_INVALID(e) (0) +#define BUILD_BUG_ON_MSG(cond, msg) (0) +#define BUILD_BUG_ON(condition) (0) +#define BUILD_BUG() (0) +#else /* __CHECKER__ */ + +/* Force a compilation error if a constant expression is not a power of 2 */ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON(((n) & ((n) - 1)) != 0) +#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) + +/* + * Force a compilation error if condition is true, but also produce a + * result (of value 0 and type size_t), so the expression can be used + * e.g. in a structure initializer (or where-ever else comma expressions + * aren't permitted). + */ +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); })) +#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:(-!!(e)); })) + +/* + * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the + * expression but avoids the generation of any code, even if that expression + * has side-effects. + */ +#define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e)))) + +/** + * BUILD_BUG_ON_MSG - break compile if a condition is true & emit supplied + * error message. + * @condition: the condition which the compiler should know is false. 
+ * + * See BUILD_BUG_ON for description. + */ +#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) + +/** + * BUILD_BUG_ON - break compile if a condition is true. + * @condition: the condition which the compiler should know is false. + * + * If you have some code which relies on certain constants being equal, or + * some other compile-time-evaluated condition, you should use BUILD_BUG_ON to + * detect if someone changes it. + * + * The implementation uses gcc's reluctance to create a negative array, but gcc + * (as of 4.4) only emits that error for obvious cases (e.g. not arguments to + * inline functions). Luckily, in 4.3 they added the "error" function + * attribute just for this type of case. Thus, we use a negative sized array + * (should always create an error on gcc versions older than 4.4) and then call + * an undefined function with the error attribute (should always create an + * error on gcc 4.3 and later). If for some reason, neither creates a + * compile-time error, we'll still have a link-time error, which is harder to + * track down. + */ +#ifndef __OPTIMIZE__ +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#else +#define BUILD_BUG_ON(condition) \ + BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition) +#endif + +/** + * BUILD_BUG - break compile if used. + * + * If you have some code that you expect the compiler to eliminate at + * build time, you should use BUILD_BUG to detect if it is + * unexpectedly used. + */ +#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed") + +#endif /* __CHECKER__ */ + +#endif /* _LINUX_BUILD_BUG_H */ -- cgit v1.2.3-70-g09d2 From 287f3ca563d8ba0ede4ac0cec84218a1ea5e848f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 10 Jul 2017 15:51:10 -0700 Subject: ARM: fix rd_size declaration The global variable 'rd_size' is declared as 'int' in source file arch/arm/kernel/atags_parse.c and as 'unsigned long' in drivers/block/brd.c. Fix this inconsistency. 
Additionally, remove the declarations of rd_image_start, rd_prompt and rd_doload from parse_tag_ramdisk() since these duplicate existing declarations in <linux/initrd.h>. Link: http://lkml.kernel.org/r/20170627065024.12347-1-bart.vanassche@wdc.com Signed-off-by: Bart Van Assche Acked-by: Russell King Cc: Jens Axboe Cc: Jan Kara Cc: Jason Yan Cc: Zhaohongjiang Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/atags_parse.c | 3 +-- drivers/block/brd.c | 1 + include/linux/initrd.h | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/arm/kernel/atags_parse.c b/arch/arm/kernel/atags_parse.c index 68c6ae0b9e4c..98fbfd235ac8 100644 --- a/arch/arm/kernel/atags_parse.c +++ b/arch/arm/kernel/atags_parse.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -91,8 +92,6 @@ __tagtable(ATAG_VIDEOTEXT, parse_tag_videotext); #ifdef CONFIG_BLK_DEV_RAM static int __init parse_tag_ramdisk(const struct tag *tag) { - extern int rd_size, rd_image_start, rd_prompt, rd_doload; - rd_image_start = tag->u.ramdisk.start; rd_doload = (tag->u.ramdisk.flags & 1) == 0; rd_prompt = (tag->u.ramdisk.flags & 2) == 0; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 17723fd50a53..104b71c0490d 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/include/linux/initrd.h b/include/linux/initrd.h index 55289d261b4f..bc67b767f9ce 100644 --- a/include/linux/initrd.h +++ b/include/linux/initrd.h @@ -10,6 +10,9 @@ extern int rd_prompt; /* starting block # of image */ extern int rd_image_start; +/* size of a single RAM disk */ +extern unsigned long rd_size; + /* 1 if it is not an error if initrd_start < memory_start */ extern int initrd_below_start_ok; -- cgit v1.2.3-70-g09d2 From e5af323c9badd5dc09af7ccf9d45616ebffc623c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 10 Jul 2017 15:51:29 -0700 Subject: bitmap: optimise
bitmap_set and bitmap_clear of a single bit We have eight users calling bitmap_clear for a single bit and seventeen calling bitmap_set for a single bit. Rather than fix all of them to call __clear_bit or __set_bit, turn bitmap_clear and bitmap_set into inline functions and make this special case efficient. Link: http://lkml.kernel.org/r/20170628153221.11322-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Rasmus Villemoes Cc: Martin Schwidefsky Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 23 ++++++++++++++++++++--- lib/bitmap.c | 8 ++++---- lib/test_bitmap.c | 3 --- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3b77588a9360..4e0f0c8167af 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -112,9 +112,8 @@ extern int __bitmap_intersects(const unsigned long *bitmap1, extern int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); - -extern void bitmap_set(unsigned long *map, unsigned int start, int len); -extern void bitmap_clear(unsigned long *map, unsigned int start, int len); +extern void __bitmap_set(unsigned long *map, unsigned int start, int len); +extern void __bitmap_clear(unsigned long *map, unsigned int start, int len); extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, @@ -315,6 +314,24 @@ static __always_inline int bitmap_weight(const unsigned long *src, unsigned int return __bitmap_weight(src, nbits); } +static __always_inline void bitmap_set(unsigned long *map, unsigned int start, + unsigned int nbits) +{ + if (__builtin_constant_p(nbits) && nbits == 1) + __set_bit(start, map); + else + __bitmap_set(map, start, nbits); +} + +static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, + 
unsigned int nbits) +{ + if (__builtin_constant_p(nbits) && nbits == 1) + __clear_bit(start, map); + else + __bitmap_clear(map, start, nbits); +} + static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, int nbits) { diff --git a/lib/bitmap.c b/lib/bitmap.c index 08c6ef3a2b6f..9a532805364b 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -251,7 +251,7 @@ int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) } EXPORT_SYMBOL(__bitmap_weight); -void bitmap_set(unsigned long *map, unsigned int start, int len) +void __bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; @@ -270,9 +270,9 @@ void bitmap_set(unsigned long *map, unsigned int start, int len) *p |= mask_to_set; } } -EXPORT_SYMBOL(bitmap_set); +EXPORT_SYMBOL(__bitmap_set); -void bitmap_clear(unsigned long *map, unsigned int start, int len) +void __bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; @@ -291,7 +291,7 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len) *p &= ~mask_to_clear; } } -EXPORT_SYMBOL(bitmap_clear); +EXPORT_SYMBOL(__bitmap_clear); /** * bitmap_find_next_zero_area_off - find a contiguous aligned zero area diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 252d3bddbe7d..2526a2975c51 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -333,9 +333,6 @@ static void __init test_bitmap_u32_array_conversions(void) } } -#define __bitmap_set(a, b, c) bitmap_set(a, b, c) -#define __bitmap_clear(a, b, c) bitmap_clear(a, b, c) - static void noinline __init test_mem_optimisations(void) { DECLARE_BITMAP(bmap1, 1024); -- cgit v1.2.3-70-g09d2 From 2a98dc028f911a7c59c87d11d4eed6626be1605b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 10 Jul 2017 15:51:32 -0700 Subject: include/linux/bitmap.h: turn bitmap_set and bitmap_clear 
into memset when possible Several callers have constant 'start' and an 'nbits' that is a multiple of 8, so we can turn them into calls to memset. We don't need the entirety of 'start' and 'nbits' to be constant, we just need to know whether they're divisible by 8. Link: http://lkml.kernel.org/r/20170628153221.11322-4-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Rasmus Villemoes Cc: Martin Schwidefsky Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 4e0f0c8167af..c04c9d155e59 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -319,6 +319,9 @@ static __always_inline void bitmap_set(unsigned long *map, unsigned int start, { if (__builtin_constant_p(nbits) && nbits == 1) __set_bit(start, map); + else if (__builtin_constant_p(start & 7) && IS_ALIGNED(start, 8) && + __builtin_constant_p(nbits & 7) && IS_ALIGNED(nbits, 8)) + memset((char *)map + start / 8, 0xff, nbits / 8); else __bitmap_set(map, start, nbits); } @@ -328,6 +331,9 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, { if (__builtin_constant_p(nbits) && nbits == 1) __clear_bit(start, map); + else if (__builtin_constant_p(start & 7) && IS_ALIGNED(start, 8) && + __builtin_constant_p(nbits & 7) && IS_ALIGNED(nbits, 8)) + memset((char *)map + start / 8, 0, nbits / 8); else __bitmap_clear(map, start, nbits); } -- cgit v1.2.3-70-g09d2 From 2c6deb01525ac11cc03c44fe31e3f45ce2cadaf9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 10 Jul 2017 15:51:35 -0700 Subject: bitmap: use memcmp optimisation in more situations Commit 7dd968163f7c ("bitmap: bitmap_equal memcmp optimization") was rather more restrictive than necessary; we can use memcmp() to implement bitmap_equal() as long as the number of bits can be proved to be a multiple of 8. 
And architectures other than s390 may be able to make good use of this optimisation. [arnd@arndb.de: fix build: add a memcmp() declaration] Link: http://lkml.kernel.org/r/20170630153908.3439707-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170628153221.11322-5-willy@infradead.org Signed-off-by: Matthew Wilcox Signed-off-by: Arnd Bergmann Acked-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/boot/compressed/decompress.c | 1 + include/linux/bitmap.h | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c index ea7832702a8f..f3a4bedd1afc 100644 --- a/arch/arm/boot/compressed/decompress.c +++ b/arch/arm/boot/compressed/decompress.c @@ -33,6 +33,7 @@ extern void error(char *); /* Not needed, but used in some headers pulled in by decompressors */ extern char * strstr(const char * s1, const char *s2); extern size_t strlen(const char *s); +extern int memcmp(const void *cs, const void *ct, size_t count); #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index c04c9d155e59..5797ca6fdfe2 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -266,10 +266,8 @@ static inline int bitmap_equal(const unsigned long *src1, { if (small_const_nbits(nbits)) return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); -#ifdef CONFIG_S390 - if (__builtin_constant_p(nbits) && (nbits % BITS_PER_LONG) == 0) + if (__builtin_constant_p(nbits & 7) && IS_ALIGNED(nbits, 8)) return !memcmp(src1, src2, nbits / 8); -#endif return __bitmap_equal(src1, src2, nbits); } -- cgit v1.2.3-70-g09d2 From a94c33dd1f677d16c4f1a162b4b3e9eba1b07c24 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Mon, 10 Jul 2017 15:51:58 -0700 Subject: lib/extable.c: use bsearch() library function in search_extable() [thomas@m3y3r.de: v3: fix arch specific implementations] 
Link: http://lkml.kernel.org/r/1497890858.12931.7.camel@m3y3r.de Signed-off-by: Thomas Meyer Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/module.c | 3 ++- arch/mips/kernel/traps.c | 3 ++- arch/sh/mm/extable_64.c | 34 ++++++++++++++++++---------------- arch/sparc/mm/extable.c | 28 ++++++++++++++-------------- include/linux/extable.h | 5 +++-- kernel/extable.c | 3 ++- kernel/module.c | 2 +- lib/extable.c | 41 +++++++++++++++++++++-------------------- 8 files changed, 63 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 94627a3a6a0d..50c020c47e54 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -317,7 +317,8 @@ const struct exception_table_entry *search_module_dbetables(unsigned long addr) spin_lock_irqsave(&dbe_lock, flags); list_for_each_entry(dbe, &dbe_list, dbe_list) { - e = search_extable(dbe->dbe_start, dbe->dbe_end - 1, addr); + e = search_extable(dbe->dbe_start, + dbe->dbe_end - dbe->dbe_start, addr); if (e) break; } diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 38dfa27730ff..b68b4d0726d3 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -429,7 +429,8 @@ static const struct exception_table_entry *search_dbe_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___dbe_table, __stop___dbe_table - 1, addr); + e = search_extable(__start___dbe_table, + __stop___dbe_table - __start___dbe_table, addr); if (!e) e = search_module_dbetables(addr); return e; diff --git a/arch/sh/mm/extable_64.c b/arch/sh/mm/extable_64.c index b90cdfad2c78..7a3b4d33d2e7 100644 --- a/arch/sh/mm/extable_64.c +++ b/arch/sh/mm/extable_64.c @@ -10,6 +10,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. 
*/ +#include #include #include #include @@ -40,10 +41,23 @@ static const struct exception_table_entry *check_exception_ranges(unsigned long return NULL; } +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > _elt->insn) + return 1; + if (_key < _elt->insn) + return -1; + return 0; +} + /* Simple binary search */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { const struct exception_table_entry *mid; @@ -52,20 +66,8 @@ search_extable(const struct exception_table_entry *first, if (mid) return mid; - while (first <= last) { - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } int fixup_exception(struct pt_regs *regs) diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c index db214e9931d9..2422511dc8c5 100644 --- a/arch/sparc/mm/extable.c +++ b/arch/sparc/mm/extable.c @@ -13,11 +13,11 @@ void sort_extable(struct exception_table_entry *start, /* Caller knows they are in a range if ret->fixup == 0 */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *start, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - const struct exception_table_entry *walk; + int i; /* Single insn entries are encoded as: * word 1: insn address @@ -37,30 +37,30 @@ search_extable(const struct exception_table_entry *start, */ /* 1. Try to find an exact match. 
*/ - for (walk = start; walk <= last; walk++) { - if (walk->fixup == 0) { + for (i = 0; i < num; i++) { + if (base[i].fixup == 0) { /* A range entry, skip both parts. */ - walk++; + i++; continue; } /* A deleted entry; see trim_init_extable */ - if (walk->fixup == -1) + if (base[i].fixup == -1) continue; - if (walk->insn == value) - return walk; + if (base[i].insn == value) + return &base[i]; } /* 2. Try to find a range match. */ - for (walk = start; walk <= (last - 1); walk++) { - if (walk->fixup) + for (i = 0; i < (num - 1); i++) { + if (base[i].fixup) continue; - if (walk[0].insn <= value && walk[1].insn > value) - return walk; + if (base[i].insn <= value && base[i + 1].insn > value) + return &base[i]; - walk++; + i++; } return NULL; diff --git a/include/linux/extable.h b/include/linux/extable.h index 7effea4b257d..28addad0dda7 100644 --- a/include/linux/extable.h +++ b/include/linux/extable.h @@ -2,13 +2,14 @@ #define _LINUX_EXTABLE_H #include /* for NULL */ +#include struct module; struct exception_table_entry; const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value); void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish); diff --git a/kernel/extable.c b/kernel/extable.c index 223df4a328a4..38c2412401a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, __stop___ex_table-1, addr); + e = search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); if (!e) e = search_module_extables(addr); return e; diff --git a/kernel/module.c b/kernel/module.c index b3dbdde82e80..b0f92a365140 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4196,7 
+4196,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) goto out; e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, + mod->num_exentries, addr); out: preempt_enable(); diff --git a/lib/extable.c b/lib/extable.c index 62968daa66a9..f54996fdd0b8 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -51,7 +52,7 @@ static void swap_ex(void *a, void *b, int size) * This is used both for the kernel exception table and for * the exception tables of modules that get loaded. */ -static int cmp_ex(const void *a, const void *b) +static int cmp_ex_sort(const void *a, const void *b) { const struct exception_table_entry *x = a, *y = b; @@ -67,7 +68,7 @@ void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, swap_ex); + cmp_ex_sort, swap_ex); } #ifdef CONFIG_MODULES @@ -93,6 +94,20 @@ void trim_init_extable(struct module *m) #endif /* !ARCH_HAS_SORT_EXTABLE */ #ifndef ARCH_HAS_SEARCH_EXTABLE + +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > ex_to_insn(_elt)) + return 1; + if (_key < ex_to_insn(_elt)) + return -1; + return 0; +} + /* * Search one exception table for an entry corresponding to the * given instruction address, and return the address of the entry, @@ -101,25 +116,11 @@ void trim_init_extable(struct module *m) * already sorted. 
*/ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - while (first <= last) { - const struct exception_table_entry *mid; - - mid = ((last - first) >> 1) + first; - /* - * careful, the distance between value and insn - * can be larger than MAX_LONG: - */ - if (ex_to_insn(mid) < value) - first = mid + 1; - else if (ex_to_insn(mid) > value) - last = mid - 1; - else - return mid; - } - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } #endif -- cgit v1.2.3-70-g09d2