From 1d798ca3f16437c71ff63e36597ff07f9c12e4d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:29:54 -0800 Subject: mm: make compound_head() robust Hugh has pointed that compound_head() call can be unsafe in some context. There's one example: CPU0 CPU1 isolate_migratepages_block() page_count() compound_head() !!PageTail() == true put_page() tail->first_page = NULL head = tail->first_page alloc_pages(__GFP_COMP) prep_compound_page() tail->first_page = head __SetPageTail(p); !!PageTail() == true The race is pure theoretical. I don't it's possible to trigger it in practice. But who knows. We can fix the race by changing how encode PageTail() and compound_head() within struct page to be able to update them in one shot. The patch introduces page->compound_head into third double word block in front of compound_dtor and compound_order. Bit 0 encodes PageTail() and the rest bits are pointer to head page if bit zero is set. The patch moves page->pmd_huge_pte out of word, just in case if an architecture defines pgtable_t into something what can have the bit 0 set. hugetlb_cgroup uses page->lru.next in the second tail page to store pointer struct hugetlb_cgroup. The patch switch it to use page->private in the second tail page instead. The space is free since ->first_page is removed from the union. The patch also opens possibility to remove HUGETLB_CGROUP_MIN_ORDER limitation, since there's now space in first tail page to store struct hugetlb_cgroup pointer. But that's out of scope of the patch. That means page->compound_head shares storage space with: - page->lru.next; - page->next; - page->rcu_head.next; That's too long list to be absolutely sure, but looks like nobody uses bit 0 of the word. page->rcu_head.next guaranteed[1] to have bit 0 clean as long as we use call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(). But future call_rcu_lazy() is not allowed as it makes use of the bit and we can get false positive PageTail(). [1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Cc: Vlastimil Babka Acked-by: Paul E. McKenney Cc: Aneesh Kumar K.V Cc: Andi Kleen Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fae1bd6f9f37..e361001519d3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -445,15 +445,15 @@ out: /* * Higher-order pages are called "compound pages". They are structured thusly: * - * The first PAGE_SIZE page is called the "head page". + * The first PAGE_SIZE page is called the "head page" and have PG_head set. * - * The remaining PAGE_SIZE pages are called "tail pages". + * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded + * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * All pages have PG_compound set. All tail pages have their ->first_page - * pointing at the head page. + * The first tail page's ->compound_dtor holds the offset in array of compound + * page destructors. See compound_page_dtors. * - * The first tail page's ->lru.next holds the address of the compound page's - * put_page() function. Its ->lru.prev holds the order of allocation. + * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. */ @@ -473,10 +473,7 @@ void prep_compound_page(struct page *page, unsigned long order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } @@ -854,17 +851,30 @@ static void free_one_page(struct zone *zone, static int free_tail_pages_check(struct page *head_page, struct page *page) { - if (!IS_ENABLED(CONFIG_DEBUG_VM)) - return 0; + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. + */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } if (unlikely(!PageTail(page))) { bad_page(page, "PageTail not set", 0); - return 1; + goto out; } - if (unlikely(page->first_page != head_page)) { - bad_page(page, "first_page not consistent", 0); - return 1; + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; } - return 0; + ret = 0; +out: + clear_compound_head(page); + return ret; } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -931,6 +941,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) struct page *page = pfn_to_page(start_pfn); init_reserved_page(start_pfn); + + /* Avoid false-positive PageTail() */ + INIT_LIST_HEAD(&page->lru); + SetPageReserved(page); } } -- cgit v1.2.3-70-g09d2