From 124fe20d94630b6f173dae5eb815e6e6e350c72d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 10 Aug 2015 23:07:05 -0400 Subject: mm: enhance region_is_ram() to region_intersects() region_is_ram() is used to prevent the establishment of aliased mappings to physical "System RAM" with incompatible cache settings. However, it uses "-1" to indicate both "unknown" memory ranges (ranges not described by platform firmware) and "mixed" ranges (where the parameters describe a range that partially overlaps "System RAM"). Fix this up by explicitly tracking the "unknown" vs "mixed" resource cases and returning REGION_INTERSECTS, REGION_MIXED, or REGION_DISJOINT. This re-write also adds support for detecting when the requested region completely eclipses all of a resource. Note, the implementation treats overlaps between "unknown" and the requested memory type as REGION_INTERSECTS. Finally, other memory types can be passed in by name, for now the only usage "System RAM". Suggested-by: Luis R. Rodriguez Reviewed-by: Toshi Kani Signed-off-by: Dan Williams --- include/linux/mm.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e872f92dbac..84b05ebedb2d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -369,7 +369,14 @@ static inline int put_page_unless_one(struct page *page) } extern int page_is_ram(unsigned long pfn); -extern int region_is_ram(resource_size_t phys_addr, unsigned long size); + +enum { + REGION_INTERSECTS, + REGION_DISJOINT, + REGION_MIXED, +}; + +int region_intersects(resource_size_t offset, size_t size, const char *type); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); -- cgit v1.2.3-70-g09d2 From 8025e5ddf9c1cac0e632dad49a63abf7848b78cb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Jul 2015 11:55:44 -0300 Subject: [media] mm: Provide new get_vaddr_frames() helper Provide new function get_vaddr_frames(). This function maps virtual addresses from given start and fills given array with page frame numbers of the corresponding pages. If given start belongs to a normal vma, the function grabs reference to each of the pages to pin them in memory. If start belongs to VM_IO | VM_PFNMAP vma, we don't touch page structures. Caller must make sure pfns aren't reused for anything else while he is using them. This function is created for various drivers to simplify handling of their buffers. Signed-off-by: Jan Kara Acked-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Andrew Morton Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/mm.h | 44 ++++++++++ mm/Kconfig | 3 + mm/Makefile | 1 + mm/frame_vector.c | 230 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 278 insertions(+) create mode 100644 mm/frame_vector.c (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e872f92dbac..79ad29a8a60a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -20,6 +20,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1198,6 +1199,49 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, int write, int force, struct page **pages); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); + +/* Container for pinned pfns / pages */ +struct frame_vector { + unsigned int nr_allocated; /* Number of frames we have space for */ + unsigned int nr_frames; /* Number of frames stored in ptrs array */ + bool got_ref; /* Did we pin pages by getting page ref? */ + bool is_pfns; /* Does array contain pages or pfns? */ + void *ptrs[0]; /* Array of pinned pfns / pages. Use + * pfns_vector_pages() or pfns_vector_pfns() + * for access */ +}; + +struct frame_vector *frame_vector_create(unsigned int nr_frames); +void frame_vector_destroy(struct frame_vector *vec); +int get_vaddr_frames(unsigned long start, unsigned int nr_pfns, + bool write, bool force, struct frame_vector *vec); +void put_vaddr_frames(struct frame_vector *vec); +int frame_vector_to_pages(struct frame_vector *vec); +void frame_vector_to_pfns(struct frame_vector *vec); + +static inline unsigned int frame_vector_count(struct frame_vector *vec) +{ + return vec->nr_frames; +} + +static inline struct page **frame_vector_pages(struct frame_vector *vec) +{ + if (vec->is_pfns) { + int err = frame_vector_to_pages(vec); + + if (err) + return ERR_PTR(err); + } + return (struct page **)(vec->ptrs); +} + +static inline unsigned long *frame_vector_pfns(struct frame_vector *vec) +{ + if (!vec->is_pfns) + frame_vector_to_pfns(vec); + return (unsigned long *)(vec->ptrs); +} + struct kvec; int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, struct page **pages); diff --git a/mm/Kconfig b/mm/Kconfig index e79de2bd12cd..7f146dd32fc5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -654,3 +654,6 @@ config DEFERRED_STRUCT_PAGE_INIT when kswapd starts. This has a potential performance impact on processes running early in the lifetime of the systemm until kswapd finishes the initialisation. + +config FRAME_VECTOR + bool diff --git a/mm/Makefile b/mm/Makefile index 98c4eaeabdcb..be5d5c866305 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o diff --git a/mm/frame_vector.c b/mm/frame_vector.c new file mode 100644 index 000000000000..cdabcb93c6a6 --- /dev/null +++ b/mm/frame_vector.c @@ -0,0 +1,230 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * get_vaddr_frames() - map virtual addresses to pfns + * @start: starting user address + * @nr_frames: number of pages / pfns from start to map + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. See description of the same argument of + get_user_pages(). + * @vec: structure which receives pages / pfns of the addresses mapped. + * It should have space for at least nr_frames entries. + * + * This function maps virtual addresses from @start and fills @vec structure + * with page frame numbers or page pointers to corresponding pages (choice + * depends on the type of the vma underlying the virtual address). If @start + * belongs to a normal vma, the function grabs reference to each of the pages + * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't + * touch page structures and the caller must make sure pfns aren't reused for + * anything else while he is using them. + * + * The function returns number of pages mapped which may be less than + * @nr_frames. In particular we stop mapping if there are more vmas of + * different type underlying the specified range of virtual addresses. + * When the function isn't able to map a single page, it returns error. + * + * This function takes care of grabbing mmap_sem as necessary. + */ +int get_vaddr_frames(unsigned long start, unsigned int nr_frames, + bool write, bool force, struct frame_vector *vec) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int ret = 0; + int err; + int locked; + + if (nr_frames == 0) + return 0; + + if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) + nr_frames = vec->nr_allocated; + + down_read(&mm->mmap_sem); + locked = 1; + vma = find_vma_intersection(mm, start, start + 1); + if (!vma) { + ret = -EFAULT; + goto out; + } + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { + vec->got_ref = true; + vec->is_pfns = false; + ret = get_user_pages_locked(current, mm, start, nr_frames, + write, force, (struct page **)(vec->ptrs), &locked); + goto out; + } + + vec->got_ref = false; + vec->is_pfns = true; + do { + unsigned long *nums = frame_vector_pfns(vec); + + while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) { + err = follow_pfn(vma, start, &nums[ret]); + if (err) { + if (ret == 0) + ret = err; + goto out; + } + start += PAGE_SIZE; + ret++; + } + /* + * We stop if we have enough pages or if VMA doesn't completely + * cover the tail page. + */ + if (ret >= nr_frames || start < vma->vm_end) + break; + vma = find_vma_intersection(mm, start, start + 1); + } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)); +out: + if (locked) + up_read(&mm->mmap_sem); + if (!ret) + ret = -EFAULT; + if (ret > 0) + vec->nr_frames = ret; + return ret; +} +EXPORT_SYMBOL(get_vaddr_frames); + +/** + * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired + * them + * @vec: frame vector to put + * + * Drop references to pages if get_vaddr_frames() acquired them. We also + * invalidate the frame vector so that it is prepared for the next call into + * get_vaddr_frames(). + */ +void put_vaddr_frames(struct frame_vector *vec) +{ + int i; + struct page **pages; + + if (!vec->got_ref) + goto out; + pages = frame_vector_pages(vec); + /* + * frame_vector_pages() might needed to do a conversion when + * get_vaddr_frames() got pages but vec was later converted to pfns. + * But it shouldn't really fail to convert pfns back... + */ + if (WARN_ON(IS_ERR(pages))) + goto out; + for (i = 0; i < vec->nr_frames; i++) + put_page(pages[i]); + vec->got_ref = false; +out: + vec->nr_frames = 0; +} +EXPORT_SYMBOL(put_vaddr_frames); + +/** + * frame_vector_to_pages - convert frame vector to contain page pointers + * @vec: frame vector to convert + * + * Convert @vec to contain array of page pointers. If the conversion is + * successful, return 0. Otherwise return an error. Note that we do not grab + * page references for the page structures. + */ +int frame_vector_to_pages(struct frame_vector *vec) +{ + int i; + unsigned long *nums; + struct page **pages; + + if (!vec->is_pfns) + return 0; + nums = frame_vector_pfns(vec); + for (i = 0; i < vec->nr_frames; i++) + if (!pfn_valid(nums[i])) + return -EINVAL; + pages = (struct page **)nums; + for (i = 0; i < vec->nr_frames; i++) + pages[i] = pfn_to_page(nums[i]); + vec->is_pfns = false; + return 0; +} +EXPORT_SYMBOL(frame_vector_to_pages); + +/** + * frame_vector_to_pfns - convert frame vector to contain pfns + * @vec: frame vector to convert + * + * Convert @vec to contain array of pfns. + */ +void frame_vector_to_pfns(struct frame_vector *vec) +{ + int i; + unsigned long *nums; + struct page **pages; + + if (vec->is_pfns) + return; + pages = (struct page **)(vec->ptrs); + nums = (unsigned long *)pages; + for (i = 0; i < vec->nr_frames; i++) + nums[i] = page_to_pfn(pages[i]); + vec->is_pfns = true; +} +EXPORT_SYMBOL(frame_vector_to_pfns); + +/** + * frame_vector_create() - allocate & initialize structure for pinned pfns + * @nr_frames: number of pfns slots we should reserve + * + * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns + * pfns. + */ +struct frame_vector *frame_vector_create(unsigned int nr_frames) +{ + struct frame_vector *vec; + int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames; + + if (WARN_ON_ONCE(nr_frames == 0)) + return NULL; + /* + * This is absurdly high. It's here just to avoid strange effects when + * arithmetics overflows. + */ + if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2)) + return NULL; + /* + * Avoid higher order allocations, use vmalloc instead. It should + * be rare anyway. + */ + if (size <= PAGE_SIZE) + vec = kmalloc(size, GFP_KERNEL); + else + vec = vmalloc(size); + if (!vec) + return NULL; + vec->nr_allocated = nr_frames; + vec->nr_frames = 0; + return vec; +} +EXPORT_SYMBOL(frame_vector_create); + +/** + * frame_vector_destroy() - free memory allocated to carry frame vector + * @vec: Frame vector to free + * + * Free structure allocated by frame_vector_create() to carry frames. + */ +void frame_vector_destroy(struct frame_vector *vec) +{ + /* Make sure put_vaddr_frames() got called properly... */ + VM_BUG_ON(vec->nr_frames > 0); + kvfree(vec); +} +EXPORT_SYMBOL(frame_vector_destroy); -- cgit v1.2.3-70-g09d2 From 2f064f3485cd29633ad1b3cfb00cc519509a3d72 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 21 Aug 2015 14:11:51 -0700 Subject: mm: make page pfmemalloc check more robust Commit c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb") added checks for page->pfmemalloc to __skb_fill_page_desc(): if (page->pfmemalloc && !page->mapping) skb->pfmemalloc = true; It assumes page->mapping == NULL implies that page->pfmemalloc can be trusted. However, __delete_from_page_cache() can set set page->mapping to NULL and leave page->index value alone. Due to being in union, a non-zero page->index will be interpreted as true page->pfmemalloc. So the assumption is invalid if the networking code can see such a page. And it seems it can. We have encountered this with a NFS over loopback setup when such a page is attached to a new skbuf. There is no copying going on in this case so the page confuses __skb_fill_page_desc which interprets the index as pfmemalloc flag and the network stack drops packets that have been allocated using the reserves unless they are to be queued on sockets handling the swapping which is the case here and that leads to hangs when the nfs client waits for a response from the server which has been dropped and thus never arrive. The struct page is already heavily packed so rather than finding another hole to put it in, let's do a trick instead. We can reuse the index again but define it to an impossible value (-1UL). This is the page index so it should never see the value that large. Replace all direct users of page->pfmemalloc by page_is_pfmemalloc which will hide this nastiness from unspoiled eyes. The information will get lost if somebody wants to use page->index obviously but that was the case before and the original code expected that the information should be persisted somewhere else if that is really needed (e.g. what SLAB and SLUB do). [akpm@linux-foundation.org: fix blooper in slub] Fixes: c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb") Signed-off-by: Michal Hocko Debugged-by: Vlastimil Babka Debugged-by: Jiri Bohac Cc: Eric Dumazet Cc: David Miller Acked-by: Mel Gorman Cc: [3.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- include/linux/mm.h | 28 +++++++++++++++++++++++ include/linux/mm_types.h | 9 -------- include/linux/skbuff.h | 14 ++++-------- mm/page_alloc.c | 9 +++++--- mm/slab.c | 4 ++-- mm/slub.c | 2 +- net/core/skbuff.c | 2 +- 11 files changed, 47 insertions(+), 29 deletions(-) (limited to 'include/linux/mm.h') diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index 982fdcdc795b..b5b2925103ec 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -216,7 +216,7 @@ static void fm10k_reuse_rx_page(struct fm10k_ring *rx_ring, static inline bool fm10k_page_is_reserved(struct page *page) { - return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc; + return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer, diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 2f70a9b152bd..830466c49987 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6566,7 +6566,7 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring, static inline bool igb_page_is_reserved(struct page *page) { - return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc; + return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9aa6104e34ea..ae21e0b06c3a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1832,7 +1832,7 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring, static inline bool ixgbe_page_is_reserved(struct page *page) { - return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc; + return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } /** diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index e71cdde9cb01..1d7b00b038a2 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -765,7 +765,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring, static inline bool ixgbevf_page_is_reserved(struct page *page) { - return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc; + return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } /** diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e872f92dbac..bf6f117fcf4d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1002,6 +1002,34 @@ static inline int page_mapped(struct page *page) return atomic_read(&(page)->_mapcount) >= 0; } +/* + * Return true only if the page has been allocated with + * ALLOC_NO_WATERMARKS and the low watermark was not + * met implying that the system is under some pressure. + */ +static inline bool page_is_pfmemalloc(struct page *page) +{ + /* + * Page index cannot be this large so this must be + * a pfmemalloc page. + */ + return page->index == -1UL; +} + +/* + * Only to be called by the page allocator on a freshly allocated + * page. + */ +static inline void set_page_pfmemalloc(struct page *page) +{ + page->index = -1UL; +} + +static inline void clear_page_pfmemalloc(struct page *page) +{ + page->index = 0; +} + /* * Different kinds of faults, as returned by handle_mm_fault(). * Used to decide whether a process gets delivered SIGBUS or diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0038ac7466fd..15549578d559 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -63,15 +63,6 @@ struct page { union { pgoff_t index; /* Our offset within mapping. */ void *freelist; /* sl[aou]b first free object */ - bool pfmemalloc; /* If set by the page allocator, - * ALLOC_NO_WATERMARKS was set - * and the low watermark was not - * met implying that the system - * is under some pressure. The - * caller should try ensure - * this page is only used to - * free other pages. - */ }; union { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 22b6d9ca1654..9b88536487e6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1602,20 +1602,16 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; /* - * Propagate page->pfmemalloc to the skb if we can. The problem is - * that not all callers have unique ownership of the page. If - * pfmemalloc is set, we check the mapping as a mapping implies - * page->index is set (index and pfmemalloc share space). - * If it's a valid mapping, we cannot use page->pfmemalloc but we - * do not lose pfmemalloc information as the pages would not be - * allocated using __GFP_MEMALLOC. + * Propagate page pfmemalloc to the skb if we can. The problem is + * that not all callers have unique ownership of the page but rely + * on page_is_pfmemalloc doing the right thing(tm). */ frag->page.p = page; frag->page_offset = off; skb_frag_size_set(frag, size); page = compound_head(page); - if (page->pfmemalloc && !page->mapping) + if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } @@ -2263,7 +2259,7 @@ static inline struct page *dev_alloc_page(void) static inline void skb_propagate_pfmemalloc(struct page *page, struct sk_buff *skb) { - if (page && page->pfmemalloc) + if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df959b7d6085..5b5240b7f642 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1343,12 +1343,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_owner(page, order, gfp_flags); /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking * steps that will free more memory. The caller should avoid the page * being used for !PFMEMALLOC purposes. */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + if (alloc_flags & ALLOC_NO_WATERMARKS) + set_page_pfmemalloc(page); + else + clear_page_pfmemalloc(page); return 0; } @@ -3345,7 +3348,7 @@ refill: atomic_add(size - 1, &page->_count); /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page->pfmemalloc; + nc->pfmemalloc = page_is_pfmemalloc(page); nc->pagecnt_bias = size; nc->offset = size; } diff --git a/mm/slab.c b/mm/slab.c index 200e22412a16..bbd0b47dc6a9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, } /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (unlikely(page->pfmemalloc)) + if (page_is_pfmemalloc(page)) pfmemalloc_active = true; nr_pages = (1 << cachep->gfporder); @@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { diff --git a/mm/slub.c b/mm/slub.c index 816df0016555..f68c0e50f3c0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1427,7 +1427,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab_cache = s; __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); start = page_address(page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bf9a5d93c2d1..7b84330e5d30 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -340,7 +340,7 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) if (skb && frag_size) { skb->head_frag = 1; - if (virt_to_head_page(data)->pfmemalloc) + if (page_is_pfmemalloc(virt_to_head_page(data))) skb->pfmemalloc = 1; } return skb; -- cgit v1.2.3-70-g09d2 From 16ba6f811dfe44bc14f7946a4b257b85476fc16e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:17 -0700 Subject: userfaultfd: add VM_UFFD_MISSING and VM_UFFD_WP These two flags gets set in vma->vm_flags to tell the VM common code if the userfaultfd is armed and in which mode (only tracking missing faults, only tracking wrprotect faults or both). If neither flags is set it means the userfaultfd is not armed on the vma. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 ++ include/linux/mm.h | 2 ++ kernel/fork.c | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ca1e091881d4..3b4d8255e806 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -597,6 +597,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", [ilog2(VM_MERGEABLE)] = "mg", + [ilog2(VM_UFFD_MISSING)]= "um", + [ilog2(VM_UFFD_WP)] = "uw", }; size_t i; diff --git a/include/linux/mm.h b/include/linux/mm.h index bf6f117fcf4d..0f7cd30039ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -124,8 +124,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ +#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ diff --git a/kernel/fork.c b/kernel/fork.c index ceb4eb4abb9d..7d5f0f118a63 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -454,7 +454,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; -- cgit v1.2.3-70-g09d2 From 19a809afe2fe089317226bbe5c5a1ce7f53dcdca Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:24 -0700 Subject: userfaultfd: teach vma_merge to merge across vma->vm_userfaultfd_ctx vma->vm_userfaultfd_ctx is yet another vma parameter that vma_merge must be aware about so that we can merge vmas back like they were originally before arming the userfaultfd on some memory range. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/madvise.c | 3 ++- mm/mempolicy.c | 4 ++-- mm/mlock.c | 3 ++- mm/mmap.c | 40 +++++++++++++++++++++++++++------------- mm/mprotect.c | 3 ++- 6 files changed, 36 insertions(+), 19 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f7cd30039ea..77a9d609523e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1835,7 +1835,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *); + struct mempolicy *, struct vm_userfaultfd_ctx); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); diff --git a/mm/madvise.c b/mm/madvise.c index 64bb8a22110c..911357973905 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 99d4c1d0b858..a7f1e0d1d6b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol); + vma->anon_vma, vma->vm_file, pgoff, + new_pol, vma->vm_userfaultfd_ctx); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index 6fd2cf15e868..25936680064f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index f126923ce683..82db4fc0a9d3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end); * per-vma resources, so we don't attempt to merge those. */ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (vma->vm_ops && vma->vm_ops->close) return 0; + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) + return 0; return 1; } @@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, */ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, */ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy) + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff)) { + anon_vma, file, pgoff, + vm_userfaultfd_ctx)) { /* * OK, it can. Can we now merge in the successor as well? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen) && + anon_vma, file, + pgoff+pglen, + vm_userfaultfd_ctx) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { + anon_vma, file, pgoff+pglen, + vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); @@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* * Can we just expand an old mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, - NULL); + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e7d6f1171ecb..ef5be8eaab00 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*pprev) { vma = *pprev; goto success; -- cgit v1.2.3-70-g09d2 From 5477e70a6420a6b7ca96c8e21413ee1c96a84260 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Sep 2015 15:48:04 -0700 Subject: mm: move ->mremap() from file_operations to vm_operations_struct vma->vm_ops->mremap() looks more natural and clean in move_vma(), and this way ->mremap() can have more users. Say, vdso. While at it, s/aio_ring_remap/aio_ring_mremap/. Note: this is the minimal change before ->mremap() finds another user in file_operations; this method should have more arguments, and it can be used to kill arch_remap(). Signed-off-by: Oleg Nesterov Acked-by: Pavel Emelyanov Acked-by: Kirill A. Shutemov Cc: David Rientjes Cc: Benjamin LaHaise Cc: Hugh Dickins Cc: Jeff Moyer Cc: Laurent Dufour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 27 ++++++++++++++++++--------- include/linux/fs.h | 1 - include/linux/mm.h | 1 + mm/mremap.c | 4 ++-- 4 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include/linux/mm.h') diff --git a/fs/aio.c b/fs/aio.c index 480440f4701f..155f84253f33 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_flags |= VM_DONTEXPAND; - vma->vm_ops = &generic_file_vm_ops; - return 0; -} - -static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) +static int aio_ring_mremap(struct vm_area_struct *vma) { + struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; @@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) return res; } +static const struct vm_operations_struct aio_ring_vm_ops = { + .mremap = aio_ring_mremap, +#if IS_ENABLED(CONFIG_MMU) + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, +#endif +}; + +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_flags |= VM_DONTEXPAND; + vma->vm_ops = &aio_ring_vm_ops; + return 0; +} + static const struct file_operations aio_ring_fops = { .mmap = aio_ring_mmap, - .mremap = aio_ring_remap, }; #if IS_ENABLED(CONFIG_MIGRATION) diff --git a/include/linux/fs.h b/include/linux/fs.h index fbd780c33c5f..864203c10dbc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1612,7 +1612,6 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); - int (*mremap)(struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 77a9d609523e..8b257c43855b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -247,6 +247,7 @@ struct vm_fault { struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); + int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); diff --git a/mm/mremap.c b/mm/mremap.c index f54a43fa4b79..3310378bb60f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -277,8 +277,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, need_rmap_locks); if (moved_len < old_len) { err = -ENOMEM; - } else if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + } else if (vma->vm_ops && vma->vm_ops->mremap) { + err = vma->vm_ops->mremap(new_vma); } if (unlikely(err)) { -- cgit v1.2.3-70-g09d2 From b5330628546616af14ff23075fbf8d4ad91f6e25 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 8 Sep 2015 14:58:28 -0700 Subject: mm: introduce vma_is_anonymous(vma) helper special_mapping_fault() is absolutely broken. It seems it was always wrong, but this didn't matter until vdso/vvar started to use more than one page. And after this change vma_is_anonymous() becomes really trivial, it simply checks vm_ops == NULL. However, I do think the helper makes sense. There are a lot of ->vm_ops != NULL checks, the helper makes the caller's code more understandable (self-documented) and this is more grep-friendly. This patch (of 3): Preparation. Add the new simple helper, vma_is_anonymous(vma), and change handle_pte_fault() to use it. It will have more users. The name is not accurate, say a hpet_mmap()'ed vma is not anonymous. Perhaps it should be named vma_has_fault() instead. But it matches the logic in mmap.c/memory.c (see next changes). "True" just means that a page fault will use do_anonymous_page(). Signed-off-by: Oleg Nesterov Acked-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++++ mm/memory.c | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b257c43855b..dfb7ce05f1e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1260,6 +1260,11 @@ static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr) return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); } +static inline bool vma_is_anonymous(struct vm_area_struct *vma) +{ + return !vma->vm_ops; +} + static inline int stack_guard_page_start(struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/memory.c b/mm/memory.c index bb04d8f2f86c..882c9d7ae2f5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3267,12 +3267,12 @@ static int handle_pte_fault(struct mm_struct *mm, barrier(); if (!pte_present(entry)) { if (pte_none(entry)) { - if (vma->vm_ops) + if (vma_is_anonymous(vma)) + return do_anonymous_page(mm, vma, address, + pte, pmd, flags); + else return do_fault(mm, vma, address, pte, pmd, flags, entry); - - return do_anonymous_page(mm, vma, address, pte, pmd, - flags); } return do_swap_page(mm, vma, address, pte, pmd, flags, entry); -- cgit v1.2.3-70-g09d2 From b96375f74a6d4f39fc6cbdc0bce5175115c7f96f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:58:48 -0700 Subject: mm: add a pmd_fault handler Allow non-anonymous VMAs to provide huge pages in response to a page fault. Signed-off-by: Matthew Wilcox Cc: Hillf Danton Cc: "Kirill A. Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 30 ++++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index dfb7ce05f1e3..e1fbd18b8c4b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -249,6 +249,8 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*pmd_fault)(struct vm_area_struct *, unsigned long address, + pmd_t *, unsigned int flags); void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); /* notification that a previously read-only page is about to become diff --git a/mm/memory.c b/mm/memory.c index 882c9d7ae2f5..a3f9a8ccec0f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3232,6 +3232,27 @@ out: return 0; } +static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags) +{ + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); + if (vma->vm_ops->pmd_fault) + return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return VM_FAULT_FALLBACK; +} + +static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd, + unsigned int flags) +{ + if (!vma->vm_ops) + return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); + if (vma->vm_ops->pmd_fault) + return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return VM_FAULT_FALLBACK; +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3334,10 +3355,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pmd) return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { - int ret = VM_FAULT_FALLBACK; - if (!vma->vm_ops) - ret = do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); + int ret = create_huge_pmd(mm, vma, address, pmd, flags); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { @@ -3361,8 +3379,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, orig_pmd, pmd); if (dirty && !pmd_write(orig_pmd)) { - ret = do_huge_pmd_wp_page(mm, vma, address, pmd, - orig_pmd); + ret = wp_huge_pmd(mm, vma, address, pmd, + orig_pmd, flags); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { -- cgit v1.2.3-70-g09d2 From b5e3aa0a4d5e35329203fd09acb0dbc7f7fd64de Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 8 Sep 2015 14:59:56 -0700 Subject: mm: remove put_page_unless_one() It has no callers. Signed-off-by: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index e1fbd18b8c4b..4ec72ef1f04a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -361,18 +361,6 @@ static inline int get_page_unless_zero(struct page *page) return atomic_inc_not_zero(&page->_count); } -/* - * Try to drop a ref unless the page has a refcount of one, return false if - * that is the case. - * This is to make sure that the refcount won't become zero after this drop. - * This can be called when MMU is off so it must not access - * any of the virtual mappings. - */ -static inline int put_page_unless_one(struct page *page) -{ - return atomic_add_unless(&page->_count, -1, 1); -} - extern int page_is_ram(unsigned long pfn); extern int region_is_ram(resource_size_t phys_addr, unsigned long size); -- cgit v1.2.3-70-g09d2 From bb14c2c75db972a1bf65fd63c8d5a0b41a8f263a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 8 Sep 2015 15:01:25 -0700 Subject: mm: rename and move get/set_freepage_migratetype The pair of get/set_freepage_migratetype() functions are used to cache pageblock migratetype for a page put on a pcplist, so that it does not have to be retrieved again when the page is put on a free list (e.g. when pcplists become full). Historically it was also assumed that the value is accurate for pages on freelists (as the functions' names unfortunately suggest), but that cannot be guaranteed without affecting various allocator fast paths. It is in fact not needed and all such uses have been removed. The last remaining (but pointless) usage related to pages of freelists is in move_freepages(), which this patch removes. To prevent further confusion, rename the functions to get/set_pcppage_migratetype() and expand their description. Since all the users are now in mm/page_alloc.c, move the functions there from the shared header. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Joonsoo Kim Cc: Minchan Kim Acked-by: Michal Nazarewicz Cc: Laura Abbott Reviewed-by: Naoya Horiguchi Cc: Seungho Park Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ------------ mm/page_alloc.c | 41 ++++++++++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 25 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ec72ef1f04a..bab8ff89da50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -309,18 +309,6 @@ struct inode; #define page_private(page) ((page)->private) #define set_page_private(page, v) ((page)->private = (v)) -/* It's valid only if the page is free path or free_list */ -static inline void set_freepage_migratetype(struct page *page, int migratetype) -{ - page->index = migratetype; -} - -/* It's valid only if the page is free path or free_list */ -static inline int get_freepage_migratetype(struct page *page) -{ - return page->index; -} - /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a329cfaf634d..252665d553b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +/* + * A cached value of the page's pageblock's migratetype, used when the page is + * put on a pcplist. Used to avoid the pageblock migratetype lookup when + * freeing from pcplists in most cases, at the cost of possibly becoming stale. + * Also the migratetype set in the page does not necessarily match the pcplist + * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any + * other index - this ensures that it will be put on the correct CMA freelist. + */ +static inline int get_pcppage_migratetype(struct page *page) +{ + return page->index; +} + +static inline void set_pcppage_migratetype(struct page *page, int migratetype) +{ + page->index = migratetype; +} + #ifdef CONFIG_PM_SLEEP /* * The following functions are used by the suspend/hibernate code to temporarily @@ -789,7 +807,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = get_freepage_migratetype(page); + mt = get_pcppage_migratetype(page); /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ @@ -956,7 +974,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -1384,7 +1401,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); return page; } @@ -1461,7 +1478,6 @@ int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); - set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -1631,14 +1647,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) expand(zone, page, order, current_order, area, start_migratetype); /* - * The freepage_migratetype may differ from pageblock's + * The pcppage_migratetype may differ from pageblock's * migratetype depending on the decisions in - * try_to_steal_freepages(). This is OK as long as it - * does not differ for MIGRATE_CMA pageblocks. For CMA - * we need to make sure unallocated pages flushed from - * pcp lists are returned to the correct freelist. + * find_suitable_fallback(). This is OK as long as it does not + * differ for MIGRATE_CMA pageblocks. Those can be used as + * fallback only via special __rmqueue_cma_fallback() function */ - set_freepage_migratetype(page, start_migratetype); + set_pcppage_migratetype(page, start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); @@ -1714,7 +1729,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; - if (is_migrate_cma(get_freepage_migratetype(page))) + if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1911,7 +1926,7 @@ void free_hot_cold_page(struct page *page, bool cold) return; migratetype = get_pfnblock_migratetype(page, pfn); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -2116,7 +2131,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_freepage_migratetype(page)); + get_pcppage_migratetype(page)); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); -- cgit v1.2.3-70-g09d2 From 94bf4ec84a84d3ab2513b4e681fd3d083328d76d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 8 Sep 2015 15:03:15 -0700 Subject: mm/hwpoison: introduce put_hwpoison_page to put refcount for memory error handling Introduce put_hwpoison_page to put refcount for memory error handling. Signed-off-by: Wanpeng Li Suggested-by: Naoya Horiguchi Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory-failure.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index bab8ff89da50..11df1a8ea38b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2169,6 +2169,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int get_hwpoison_page(struct page *page); +extern void put_hwpoison_page(struct page *page); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 863544d84a09..5ceb8253e33b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -934,6 +934,27 @@ int get_hwpoison_page(struct page *page) } EXPORT_SYMBOL_GPL(get_hwpoison_page); +/** + * put_hwpoison_page() - Put refcount for memory error handling: + * @page: raw error page (hit by memory error) + */ +void put_hwpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + + if (PageHuge(head)) { + put_page(head); + return; + } + + if (PageTransHuge(head)) + if (page != head) + put_page(head); + + put_page(page); +} +EXPORT_SYMBOL_GPL(put_hwpoison_page); + /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. -- cgit v1.2.3-70-g09d2 From 1fcfd8db7f82fa1f533a6f0e4155614ff4144d56 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 Sep 2015 15:39:29 -0700 Subject: mm, mpx: add "vm_flags_t vm_flags" arg to do_mmap_pgoff() Add the additional "vm_flags_t vm_flags" argument to do_mmap_pgoff(), rename it to do_mmap(), and re-introduce do_mmap_pgoff() as a simple wrapper on top of do_mmap(). Perhaps we should update the callers of do_mmap_pgoff() and kill it later. This way mpx_mmap() can simply call do_mmap(vm_flags => VM_MPX) and do not play with vm internals. After this change mmap_region() has a single user outside of mmap.c, arch/tile/mm/elf.c:arch_setup_additional_pages(). It would be nice to change arch/tile/ and unexport mmap_region(). [kirill@shutemov.name: fix build] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oleg Nesterov Acked-by: Dave Hansen Tested-by: Dave Hansen Signed-off-by: Kirill A. Shutemov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Minchan Kim Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/mpx.c | 51 +++++++-------------------------------------------- include/linux/mm.h | 12 ++++++++++-- mm/mmap.c | 10 ++++------ mm/nommu.c | 19 ++++++++++--------- 4 files changed, 31 insertions(+), 61 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index db1b0bc5017c..134948b0926f 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) */ static unsigned long mpx_mmap(unsigned long len) { - unsigned long ret; - unsigned long addr, pgoff; struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; - struct vm_area_struct *vma; + unsigned long addr, populate; /* Only bounds table can be allocated here */ if (len != mpx_bt_size_bytes(mm)) return -EINVAL; down_write(&mm->mmap_sem); - - /* Too many mappings? */ - if (mm->map_count > sysctl_max_map_count) { - ret = -ENOMEM; - goto out; - } - - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. - */ - addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); - if (addr & ~PAGE_MASK) { - ret = addr; - goto out; - } - - vm_flags = VM_READ | VM_WRITE | VM_MPX | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - - /* Set pgoff according to addr for anon_vma */ - pgoff = addr >> PAGE_SHIFT; - - ret = mmap_region(NULL, addr, len, vm_flags, pgoff); - if (IS_ERR_VALUE(ret)) - goto out; - - vma = find_vma(mm, ret); - if (!vma) { - ret = -ENOMEM; - goto out; - } - - if (vm_flags & VM_LOCKED) { - up_write(&mm->mmap_sem); - mm_populate(ret, len); - return ret; - } - -out: + addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); up_write(&mm->mmap_sem); - return ret; + if (populate) + mm_populate(addr, populate); + + return addr; } enum reg_type { diff --git a/include/linux/mm.h b/include/linux/mm.h index f25a957bf0ab..fda728e3c27d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1873,11 +1873,19 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); -extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate); + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate); extern int do_munmap(struct mm_struct *, unsigned long, size_t); +static inline unsigned long +do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + unsigned long pgoff, unsigned long *populate) +{ + return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate); +} + #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); diff --git a/mm/mmap.c b/mm/mmap.c index b6be3249f0a9..c739d6db7193 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1260,14 +1260,12 @@ static inline int mlock_future_check(struct mm_struct *mm, /* * The caller must hold down_write(¤t->mm->mmap_sem). */ - -unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff, - unsigned long *populate) + unsigned long flags, vm_flags_t vm_flags, + unsigned long pgoff, unsigned long *populate) { struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; *populate = 0; @@ -1311,7 +1309,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) diff --git a/mm/nommu.c b/mm/nommu.c index 1cc0709fcaa5..ab14a2014dea 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1233,18 +1233,19 @@ enomem: /* * handle mapping creation for uClinux */ -unsigned long do_mmap_pgoff(struct file *file, - unsigned long addr, - unsigned long len, - unsigned long prot, - unsigned long flags, - unsigned long pgoff, - unsigned long *populate) +unsigned long do_mmap(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + vm_flags_t vm_flags, + unsigned long pgoff, + unsigned long *populate) { struct vm_area_struct *vma; struct vm_region *region; struct rb_node *rb; - unsigned long capabilities, vm_flags, result; + unsigned long capabilities, result; int ret; *populate = 0; @@ -1262,7 +1263,7 @@ unsigned long do_mmap_pgoff(struct file *file, /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ - vm_flags = determine_vm_flags(file, prot, flags, capabilities); + vm_flags |= determine_vm_flags(file, prot, flags, capabilities); /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); -- cgit v1.2.3-70-g09d2