From 3964acd0dbec123aa0a621973a2a0580034b4788 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jul 2013 13:53:28 -0700 Subject: mm: mempolicy: fix mbind_range() && vma_adjust() interaction vma_adjust() does vma_set_policy(vma, vma_policy(next)) and this is doubly wrong: 1. This leaks vma->vm_policy if it is not NULL and not equal to next->vm_policy. This can happen if vma_merge() expands "area", not prev (case 8). 2. This sets the wrong policy if vma_merge() joins prev and area, area is the vma the caller needs to update and it still has the old policy. Revert commit 1444f92c8498 ("mm: merging memory blocks resets mempolicy") which introduced these problems. Change mbind_range() to recheck mpol_equal() after vma_merge() to fix the problem that commit tried to address. Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Steven T Hampson Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Andi Kleen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 +++++- mm/mmap.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 74310017296e..4baf12e534d1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -732,7 +732,10 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (prev) { vma = prev; next = vma->vm_next; - continue; + if (mpol_equal(vma_policy(vma), new_pol)) + continue; + /* vma_merge() joined vma && vma->next, case 8 */ + goto replace; } if (vma->vm_start != vmstart) { err = split_vma(vma->vm_mm, vma, vmstart, 1); @@ -744,6 +747,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } + replace: err = vma_replace_policy(vma, new_pol); if (err) goto out; diff --git a/mm/mmap.c b/mm/mmap.c index fbad7b091090..1edbaa3136c3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -865,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end); if (next->anon_vma) anon_vma_merge(vma, next); mm->map_count--; - vma_set_policy(vma, vma_policy(next)); + mpol_put(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); /* * In mprotect's case 6 (see comments on vma_merge), -- cgit v1.2.3-70-g09d2 From ef2a2cbdda7e9d084a85846770fcc844958881f6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 31 Jul 2013 13:53:37 -0700 Subject: mm/swap.c: clear PageActive before adding pages onto unevictable list As a result of commit 13f7f78981e4 ("mm: pagevec: defer deciding which LRU to add a page to until pagevec drain time"), pages on unevictable lists can have both of PageActive and PageUnevictable set. This is not only confusing, but also corrupts page migration and shrink_[in]active_list. This patch fixes the problem by adding ClearPageActive before adding pages into unevictable list. It also cleans up VM_BUG_ONs. Signed-off-by: Naoya Horiguchi Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2c52fa..d3982c062629 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -512,12 +512,7 @@ EXPORT_SYMBOL(__lru_cache_add); */ void lru_cache_add(struct page *page) { - if (PageActive(page)) { - VM_BUG_ON(PageUnevictable(page)); - } else if (PageUnevictable(page)) { - VM_BUG_ON(PageActive(page)); - } - + VM_BUG_ON(PageActive(page) && PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); __lru_cache_add(page); } @@ -539,6 +534,7 @@ void add_page_to_unevictable_list(struct page *page) spin_lock_irq(&zone->lru_lock); lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageActive(page); SetPageUnevictable(page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); @@ -833,7 +829,6 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, int active = PageActive(page); enum lru_list lru = page_lru(page); - VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); -- cgit v1.2.3-70-g09d2 From e180cf806a93ea1abbce47b245d25204ff557ce9 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jul 2013 13:53:39 -0700 Subject: thp, mm: avoid PageUnevictable on active/inactive lru lists active/inactive lru lists can contain unevicable pages (i.e. ramfs pages that have been placed on the LRU lists when first allocated), but these pages must not have PageUnevictable set - otherwise shrink_[in]active_list goes crazy: kernel BUG at /home/space/kas/git/public/linux-next/mm/vmscan.c:1122! 1090 static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1091 struct lruvec *lruvec, struct list_head *dst, 1092 unsigned long *nr_scanned, struct scan_control *sc, 1093 isolate_mode_t mode, enum lru_list lru) 1094 { ... 1108 switch (__isolate_lru_page(page, mode)) { 1109 case 0: ... 1116 case -EBUSY: ... 1121 default: 1122 BUG(); 1123 } 1124 } ... 1130 } __isolate_lru_page() returns EINVAL for PageUnevictable(page). For lru_add_page_tail(), it means we should not set PageUnevictable() for tail pages unless we're sure that it will go to LRU_UNEVICTABLE. Let's just copy PG_active and PG_unevictable from head page in __split_huge_page_refcount(), it will simplify lru_add_page_tail(). This will fix one more bug in lru_add_page_tail(): if page_evictable(page_tail) is false and PageLRU(page) is true, page_tail will go to the same lru as page, but nobody cares to sync page_tail active/inactive state with page. So we can end up with inactive page on active lru. The patch will fix it as well since we copy PG_active from head page. Signed-off-by: Kirill A. 
Shutemov Acked-by: Dave Hansen Cc: Naoya Horiguchi Cc: Mel Gorman Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 +++- mm/swap.c | 20 ++------------------ 2 files changed, 5 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 243e710c6039..a92012a71702 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1620,7 +1620,9 @@ static void __split_huge_page_refcount(struct page *page, ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_mlocked) | - (1L << PG_uptodate))); + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_unevictable))); page_tail->flags |= (1L << PG_dirty); /* clear PageTail before overwriting first_page */ diff --git a/mm/swap.c b/mm/swap.c index d3982c062629..62b78a6e224f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -770,8 +770,6 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *list) { - int uninitialized_var(active); - enum lru_list lru; const int file = 0; VM_BUG_ON(!PageHead(page)); @@ -783,20 +781,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, if (!list) SetPageLRU(page_tail); - if (page_evictable(page_tail)) { - if (PageActive(page)) { - SetPageActive(page_tail); - active = 1; - lru = LRU_ACTIVE_ANON; - } else { - active = 0; - lru = LRU_INACTIVE_ANON; - } - } else { - SetPageUnevictable(page_tail); - lru = LRU_UNEVICTABLE; - } - if (likely(PageLRU(page))) list_add_tail(&page_tail->lru, &page->lru); else if (list) { @@ -812,13 +796,13 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, * Use the standard add function to put page_tail on the list, * but then correct its position so they all end up in order. */ - add_page_to_lru_list(page_tail, lruvec, lru); + add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); list_head = page_tail->lru.prev; list_move_tail(&page_tail->lru, list_head); } if (!PageUnevictable(page)) - update_page_reclaim_stat(lruvec, file, active); + update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v1.2.3-70-g09d2 From 9d8c5b5284e4a0167c5f7b1193692492358b8700 Mon Sep 17 00:00:00 2001 From: Heesub Shin Date: Wed, 31 Jul 2013 13:53:40 -0700 Subject: mm: zbud: fix condition check on allocation size zbud_alloc() incorrectly verifies the size of allocation limit. It should deny the allocation request greater than (PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE), not (PAGE_SIZE - ZHDR_SIZE_ALIGNED) which has no remaining spaces for its buddy. There is no point in spending the entire zbud page storing only a single page, since we don't have any benefits. 
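To see why the extra CHUNK_SIZE matters, the bound can be checked in isolation. The following is a stand-alone user-space sketch, not kernel code: PAGE_SIZE, ZHDR_SIZE_ALIGNED and CHUNK_SIZE are illustrative stand-in values, and zbud_size_ok() is a hypothetical helper that models only the size check touched by the zbud_alloc() hunk below.

#include <stdio.h>

#define PAGE_SIZE          4096UL
#define CHUNK_SIZE         64UL        /* zbud rounds allocations to chunks */
#define ZHDR_SIZE_ALIGNED  CHUNK_SIZE  /* page header, rounded up to a chunk */

/* Model of the fixed bound: reject requests that leave no room for a buddy.
 * (zbud_alloc() also rejects size <= 0 and highmem gfp; only the bound
 * matters here.) */
static int zbud_size_ok(unsigned long size)
{
	if (size == 0)
		return 0;
	return size <= PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE;
}

int main(void)
{
	unsigned long sizes[] = {
		100,
		PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE, /* largest accepted      */
		PAGE_SIZE - ZHDR_SIZE_ALIGNED,              /* old limit, now denied */
	};

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("size %4lu -> %s\n", sizes[i],
		       zbud_size_ok(sizes[i]) ? "accepted" : "rejected");
	return 0;
}

With a 4096-byte page and 64-byte chunks the largest accepted request is 3968 bytes; requests between 3969 and 4032 bytes used to pass the old check even though a buddy could never fit next to them.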
Signed-off-by: Heesub Shin Acked-by: Seth Jennings Cc: Bob Liu Cc: Dongjun Shin Cc: Sunae Seo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zbud.c b/mm/zbud.c index 9bb4710e3589..ad1e781284fd 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -257,7 +257,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, if (size <= 0 || gfp & __GFP_HIGHMEM) return -EINVAL; - if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED) + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) return -ENOSPC; chunks = size_to_chunks(size); spin_lock(&pool->lock); -- cgit v1.2.3-70-g09d2 From 22f2020f84c6da2dd0acb2dce12e39e59ff7c8be Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jul 2013 13:53:48 -0700 Subject: vmpressure: change vmpressure::sr_lock to spinlock There is nothing that can sleep inside critical sections protected by this lock and those sections are really small, so it doesn't make much sense to use a mutex for them. Change the lock to a spinlock. Signed-off-by: Michal Hocko Reported-by: Tejun Heo Cc: Anton Vorontsov Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Li Zefan Reviewed-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmpressure.h | 2 +- mm/vmpressure.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 76be077340ea..2081680e015d 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -12,7 +12,7 @@ struct vmpressure { unsigned long scanned; unsigned long reclaimed; /* The lock is used to keep the scanned/reclaimed above in sync. */ - struct mutex sr_lock; + struct spinlock sr_lock; /* The list of vmpressure_event structs. */ struct list_head events; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 736a6011c2c8..f4ee6a190a4d 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -180,12 +180,12 @@ static void vmpressure_work_fn(struct work_struct *work) if (!vmpr->scanned) return; - mutex_lock(&vmpr->sr_lock); + spin_lock(&vmpr->sr_lock); scanned = vmpr->scanned; reclaimed = vmpr->reclaimed; vmpr->scanned = 0; vmpr->reclaimed = 0; - mutex_unlock(&vmpr->sr_lock); + spin_unlock(&vmpr->sr_lock); do { if (vmpressure_event(vmpr, scanned, reclaimed)) @@ -240,11 +240,11 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, if (!scanned) return; - mutex_lock(&vmpr->sr_lock); + spin_lock(&vmpr->sr_lock); vmpr->scanned += scanned; vmpr->reclaimed += reclaimed; scanned = vmpr->scanned; - mutex_unlock(&vmpr->sr_lock); + spin_unlock(&vmpr->sr_lock); if (scanned < vmpressure_win || work_pending(&vmpr->work)) return; @@ -367,7 +367,7 @@ void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, */ void vmpressure_init(struct vmpressure *vmpr) { - mutex_init(&vmpr->sr_lock); + spin_lock_init(&vmpr->sr_lock); mutex_init(&vmpr->events_lock); INIT_LIST_HEAD(&vmpr->events); INIT_WORK(&vmpr->work, vmpressure_work_fn); -- cgit v1.2.3-70-g09d2 From 8e0ed445b3478468372449859c45c6b3032acf2f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jul 2013 13:53:50 -0700 Subject: vmpressure: do not check for pending work to prevent from new work because the check is racy and it doesn't give us much anyway, as schedule_work handles this case already.
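The reason the removed work_pending() test buys nothing is that schedule_work() itself performs an atomic test-and-set on the item's pending bit. A rough user-space analogy in plain C11 atomics follows; schedule_work_once() and work_done() are made-up names for illustration, not kernel API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct work {
	atomic_bool pending;
};

/* Queue the item unless it is already pending; returns 1 if newly queued. */
static int schedule_work_once(struct work *w)
{
	if (atomic_exchange(&w->pending, true))
		return 0;          /* already pending, nothing to do */
	/* ... hand the item to a worker thread here ... */
	return 1;
}

/* Called by the worker once the item has run. */
static void work_done(struct work *w)
{
	atomic_store(&w->pending, false);
}

int main(void)
{
	struct work w = { .pending = false };

	printf("first queue:  %d\n", schedule_work_once(&w));
	printf("second queue: %d\n", schedule_work_once(&w));
	work_done(&w);
	printf("after it ran: %d\n", schedule_work_once(&w));
	return 0;
}

Because the test and the set happen in one atomic step inside the helper, an external pre-check can only race with it, never improve on it.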
Signed-off-by: Michal Hocko Reported-by: Tejun Heo Cc: Anton Vorontsov Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Li Zefan Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmpressure.c b/mm/vmpressure.c index f4ee6a190a4d..192f9731931d 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -246,7 +246,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, scanned = vmpr->scanned; spin_unlock(&vmpr->sr_lock); - if (scanned < vmpressure_win || work_pending(&vmpr->work)) + if (scanned < vmpressure_win) return; schedule_work(&vmpr->work); } -- cgit v1.2.3-70-g09d2 From 33cb876e947b9ddda8dca3fb99234b743a597ef9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jul 2013 13:53:51 -0700 Subject: vmpressure: make sure there are no events queued after memcg is offlined vmpressure is called synchronously from reclaim where the target_memcg is guaranteed to be alive but the eventfd is signaled from the work queue context. This means that the memcg (along with the vmpressure structure which is embedded into it) might go away while the work item is pending, which would result in a use-after-release bug. We have two possible ways to fix this. Either vmpressure pins the memcg before it schedules vmpr->work and unpins it in vmpressure_work_fn, or the work item is explicitly flushed from the css_offline context (as suggested by Tejun). This patch implements the latter and introduces vmpressure_cleanup(), which flushes the vmpressure work queue item. It hooks into mem_cgroup_css_offline after the memcg itself is cleaned up. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Michal Hocko Reported-by: Tejun Heo Cc: Anton Vorontsov Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Li Zefan Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmpressure.h | 1 + mm/memcontrol.c | 1 + mm/vmpressure.c | 16 ++++++++++++++++ 3 files changed, 18 insertions(+) (limited to 'mm') diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 2081680e015d..7dc17e2456de 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -30,6 +30,7 @@ extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); extern void vmpressure_init(struct vmpressure *vmpr); +extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00a7a664b9c1..c290a1cf3862 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6335,6 +6335,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont) mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); mem_cgroup_destroy_all_caches(memcg); + vmpressure_cleanup(&memcg->vmpressure); } static void mem_cgroup_css_free(struct cgroup *cont) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 192f9731931d..0c1e37d829fa 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -372,3 +372,19 @@ void vmpressure_init(struct vmpressure *vmpr) INIT_LIST_HEAD(&vmpr->events); INIT_WORK(&vmpr->work, vmpressure_work_fn); } + +/** + * vmpressure_cleanup() - shuts down vmpressure control
structure + * @vmpr: Structure to be cleaned up + * + * This function should be called before the structure in which it is + * embedded is cleaned up. + */ +void vmpressure_cleanup(struct vmpressure *vmpr) +{ + /* + * Make sure there is no pending work before eventfd infrastructure + * goes away. + */ + flush_work(&vmpr->work); +} -- cgit v1.2.3-70-g09d2 From 387aae6fdd737038e92d7bb40712bdf6dcb11945 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 4 Aug 2013 11:30:25 -0700 Subject: tmpfs: fix SEEK_DATA/SEEK_HOLE regression Commit 46a1c2c7ae53 ("vfs: export lseek_execute() to modules") broke the tmpfs SEEK_DATA/SEEK_HOLE implementation, because vfs_setpos() converts the carefully prepared -ENXIO to -EINVAL. Other filesystems avoid it in error cases: do the same in tmpfs. Signed-off-by: Hugh Dickins Cc: Jie Liu Cc: Al Viro Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index a87990cf9f94..8335dbd3fc35 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1798,7 +1798,8 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) } } - offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); + if (offset >= 0) + offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); mutex_unlock(&inode->i_mutex); return offset; } -- cgit v1.2.3-70-g09d2 From 370905069ce6515f38d5de0a5b1c899cbe58fe22 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 8 Aug 2013 09:06:37 -0700 Subject: Revert "slub: do not put a slab to cpu partial list when cpu_partial is 0" This reverts commit 318df36e57c0ca9f2146660d41ff28e8650af423. This commit caused Steven Rostedt's hackbench runs to run out of memory due to a leak. As noted by Joonsoo Kim, it is buggy in the following scenario: "I guess, you may set 0 to all kmem caches's cpu_partial via sysfs, doesn't it? In this case, memory leak is possible in following case. Code flow of possible leak is follwing case. * in __slab_free() 1. (!new.inuse || !prior) && !was_frozen 2. !kmem_cache_debug && !prior 3. new.frozen = 1 4. after cmpxchg_double_slab, run the (!n) case with new.frozen=1 5. with this patch, put_cpu_partial() doesn't do anything, because this cache's cpu_partial is 0 6. return In step 5, leak occur" And Steven does indeed have cpu_partial set to 0 due to RT testing. Joonsoo is cooking up a patch, but everybody agrees that reverting this for now is the right thing to do. Reported-and-bisected-by: Steven Rostedt Acked-by: Joonsoo Kim Acked-by: Pekka Enberg Signed-off-by: Linus Torvalds --- mm/slub.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 2b02d666bf63..e3ba1f2cf60c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1968,9 +1968,6 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) int pages; int pobjects; - if (!s->cpu_partial) - return; - do { pages = 0; pobjects = 0; -- cgit v1.2.3-70-g09d2 From 3e6b11df245180949938734bc192eaf32f3a06b3 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 13 Aug 2013 16:00:47 -0700 Subject: memcg: don't initialize kmem-cache destroying work for root caches struct memcg_cache_params has a union. Different parts of this union are used for root and non-root caches. A part with destroying work is used only for non-root caches. I fixed the same problem in another place v3.9-rc1-16204-gf101a94, but didn't notice this one. 
This patch fixes the kernel panic: [ 46.848187] BUG: unable to handle kernel paging request at 000000fffffffeb8 [ 46.849026] IP: [] kmem_cache_destroy_memcg_children+0x6c/0xc0 [ 46.849092] PGD 0 [ 46.849092] Oops: 0000 [#1] SMP ... Signed-off-by: Andrey Vagin Cc: Glauber Costa Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Konstantin Khlebnikov Cc: [3.9.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c290a1cf3862..c5792a5d87ce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3195,11 +3195,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, if (!s->memcg_params) return -ENOMEM; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; + INIT_WORK(&s->memcg_params->destroy, + kmem_cache_destroy_work_func); } else s->memcg_params->is_root_cache = true; -- cgit v1.2.3-70-g09d2 From 179ef71cbc085252e3fe6b8159263a7ed1d88ea4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 13 Aug 2013 16:00:49 -0700 Subject: mm: save soft-dirty bits on swapped pages Andy Lutomirski reported that if a page with _PAGE_SOFT_DIRTY bit set get swapped out, the bit is getting lost and no longer available when pte read back. To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is saved in pte entry for the page being swapped out. When such page is to be read back from a swap cache we check for bit presence and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY bit back. One of the problem was to find a place in pte entry where we can save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The _PAGE_PSE was chosen for that, it doesn't intersect with swap entry format stored in pte. Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Reviewed-by: Minchan Kim Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ fs/proc/task_mmu.c | 21 +++++++++++++++------ include/asm-generic/pgtable.h | 15 +++++++++++++++ include/linux/swapops.h | 2 ++ mm/memory.c | 2 ++ mm/rmap.c | 6 +++++- mm/swapfile.c | 19 +++++++++++++++++-- 8 files changed, 84 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7dc305a46058..bd0518a7f197 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index c98ac63aae48..5e8442f178f9 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -67,6 +67,19 @@ #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +/* + * Tracking soft dirty bit when a page goes to a swap is tricky. + * We need a bit which can be stored in pte _and_ not conflict + * with swap entry format. On x86 bits 6 and 7 are *not* involved + * into swap entry computation, but bit 6 is used for nonlinear + * file mapping, so we borrow bit 7 for soft dirty tracking. + */ +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#else +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dbf61f6174f0..e2d9bdce5e7e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -730,8 +730,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. */ pte_t ptent = *pte; - ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; - if (!pte_present(ptent)) - continue; if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } + if (!pte_present(ptent)) + continue; + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); } else if (is_swap_pte(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags = PM_SWAP; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2f47ade1b567..2a7e0d10ad9a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } + +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return 0; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte; +} #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c5fd30d2a415..8d4fa82bfb91 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) swp_entry_t arch_entry; BUG_ON(pte_file(pte)); + if (pte_swp_soft_dirty(pte)) + pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } diff --git a/mm/memory.c b/mm/memory.c index 1ce2e2a734fc..e98ecad2b9c8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 
exclusive = 1; } flush_icache_page(vma, page); + if (pte_swp_soft_dirty(orig_pte)) + pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); if (page == swapcache) do_page_add_anon_rmap(page, vma, address, exclusive); diff --git a/mm/rmap.c b/mm/rmap.c index cd356df4f71a..83325b80142b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; + pte_t swp_pte; if (PageSwapCache(page)) { /* @@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (TTU_ACTION(flags) == TTU_MIGRATION)) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 36af6eeaa67e..6cf2e60983b7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, int free) } #endif /* CONFIG_HIBERNATION */ +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * When pte keeps soft dirty bit the pte generated + * from swap entry does not has it, still it's same + * pte from logical point of view. + */ + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); +#else + return pte_same(pte, swp_pte); +#endif +} + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; @@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(pte_same(*pte, swp_pte))) { + if (unlikely(maybe_same_pte(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) -- cgit v1.2.3-70-g09d2 From 41bb3476b361ef38576cf9d539b19bae2ac93167 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 13 Aug 2013 16:00:51 -0700 Subject: mm: save soft-dirty bits on file pages Andy reported that if file page get reclaimed we lose the soft-dirty bit if it was there, so save _PAGE_BIT_SOFT_DIRTY bit when page address get encoded into pte entry. Thus when #pf happens on such non-present pte we can restore it back. 
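Both soft-dirty patches follow the same pattern: when a pte stops being present, the soft-dirty flag is parked in a bit that the swap (or file) encoding does not use, and is moved back when a normal pte is rebuilt. The following is a compact user-space model of that round trip; the bit positions and the helper names make_swap_entry() and restore_pte() are illustrative, not the real x86 layout.

#include <stdio.h>
#include <stdint.h>

#define PTE_PRESENT      (1ULL << 0)
#define PTE_SOFT_DIRTY   (1ULL << 11)   /* bit used while the page is mapped    */
#define SWP_SOFT_DIRTY   (1ULL << 7)    /* bit borrowed while it is swapped out */
#define SWP_OFFSET_SHIFT 12

/* Swap-out: encode the slot offset, carrying the soft-dirty state along. */
static uint64_t make_swap_entry(uint64_t offset, uint64_t old_pte)
{
	uint64_t swp = offset << SWP_OFFSET_SHIFT;   /* present bit stays clear */

	if (old_pte & PTE_SOFT_DIRTY)
		swp |= SWP_SOFT_DIRTY;
	return swp;
}

/* Swap-in: rebuild a present pte and restore the saved flag. */
static uint64_t restore_pte(uint64_t swp, uint64_t pfn_bits)
{
	uint64_t pte = pfn_bits | PTE_PRESENT;

	if (swp & SWP_SOFT_DIRTY)
		pte |= PTE_SOFT_DIRTY;
	return pte;
}

int main(void)
{
	uint64_t pte = PTE_PRESENT | PTE_SOFT_DIRTY;
	uint64_t swp = make_swap_entry(42, pte);
	uint64_t back = restore_pte(swp, 0x123000);

	printf("swap entry %#llx, restored soft-dirty: %s\n",
	       (unsigned long long)swp,
	       (back & PTE_SOFT_DIRTY) ? "yes" : "no");
	return 0;
}

The only constraint is that the borrowed bit must not collide with the swap or file-offset encoding, which is why the patches pick _PAGE_PSE for swapped ptes and re-pack the pte_to_pgoff() bit layout for file ptes.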
Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Minchan Kim Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level.h | 48 ++++++++++++++++++++++++++++++++++- arch/x86/include/asm/pgtable-3level.h | 3 +++ arch/x86/include/asm/pgtable.h | 15 +++++++++++ arch/x86/include/asm/pgtable_types.h | 4 ++- fs/proc/task_mmu.c | 2 ++ include/asm-generic/pgtable.h | 15 +++++++++++ mm/fremap.c | 11 +++++--- mm/memory.c | 11 +++++--- mm/rmap.c | 8 ++++-- 9 files changed, 107 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index f2b489cf1602..3bf2dd0cf61f 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -55,9 +55,53 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_MEM_SOFT_DIRTY + +/* + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and + * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset + * into this range. + */ +#define PTE_FILE_MAX_BITS 28 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) +#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) + +#define pte_to_pgoff(pte) \ + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ + & ((1U << PTE_FILE_BITS1) - 1))) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << (PTE_FILE_BITS1)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << PTE_FILE_SHIFT3) \ + + ((((off) >> \ + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ + << PTE_FILE_SHIFT4) \ + + _PAGE_FILE }) + +#else /* CONFIG_MEM_SOFT_DIRTY */ + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range: + * split up the 29 bits of offset into this range. 
*/ #define PTE_FILE_MAX_BITS 29 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) @@ -88,6 +132,8 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) << PTE_FILE_SHIFT3) \ + _PAGE_FILE }) +#endif /* CONFIG_MEM_SOFT_DIRTY */ + /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 4cc9f2b7cdc3..81bb91b49a88 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -179,6 +179,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. + * + * For soft-dirty tracking 11 bit is taken from + * the low part of pte as well. */ #define pte_to_pgoff(pte) ((pte).pte_high) #define pgoff_to_pte(off) \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index bd0518a7f197..1c00631164c2 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -329,6 +329,21 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 5e8442f178f9..f4843e031131 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -61,8 +61,10 @@ * they do not conflict with each other. 
*/ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN + #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e2d9bdce5e7e..a11720767abc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -736,6 +736,8 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); + } else if (pte_file(ptent)) { + ptent = pte_file_clear_soft_dirty(ptent); } set_pte_at(vma->vm_mm, addr, pte, ptent); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2a7e0d10ad9a..0807ddf97b05 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -432,6 +432,21 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } + +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return 0; +} #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/mm/fremap.c b/mm/fremap.c index 87da3590c61e..5bff08147768 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -57,17 +57,22 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot) { int err = -ENOMEM; - pte_t *pte; + pte_t *pte, ptfile; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; - if (!pte_none(*pte)) + ptfile = pgoff_to_pte(pgoff); + + if (!pte_none(*pte)) { + if (pte_present(*pte) && pte_soft_dirty(*pte)) + pte_file_mksoft_dirty(ptfile); zap_pte(mm, vma, addr, pte); + } - set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); + set_pte_at(mm, addr, pte, ptfile); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a diff --git a/mm/memory.c b/mm/memory.c index e98ecad2b9c8..40268410732a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1141,9 +1141,12 @@ again: continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, - addr) != page->index) - set_pte_at(mm, addr, pte, - pgoff_to_pte(page->index)); + addr) != page->index) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(ptent)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, addr, pte, ptfile); + } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -3410,6 +3413,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) + pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); diff --git a/mm/rmap.c b/mm/rmap.c index 83325b80142b..b2e29acd7e3d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1405,8 +1405,12 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. 
*/ - if (page->index != linear_page_index(vma, address)) - set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); + if (page->index != linear_page_index(vma, address)) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(pteval)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, address, pte, ptfile); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) -- cgit v1.2.3-70-g09d2 From 2b047252d087be7f2ba088b4933cd904f92e6fce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Aug 2013 11:42:25 -0700 Subject: Fix TLB gather virtual address range invalidation corner cases Ben Tebulin reported: "Since v3.7.2 on two independent machines a very specific Git repository fails in 9/10 cases on git-fsck due to an SHA1/memory failures. This only occurs on a very specific repository and can be reproduced stably on two independent laptops. Git mailing list ran out of ideas and for me this looks like some very exotic kernel issue" and bisected the failure to the backport of commit 53a59fc67f97 ("mm: limit mmu_gather batching to fix soft lockups on !CONFIG_PREEMPT"). That commit itself is not actually buggy, but what it does is to make it much more likely to hit the partial TLB invalidation case, since it introduces a new case in tlb_next_batch() that previously only ever happened when running out of memory. The real bug is that the TLB gather virtual memory range setup is subtly buggered. It was introduced in commit 597e1c3580b7 ("mm/mmu_gather: enable tlb flush range in generic mmu_gather"), and the range handling was already fixed at least once in commit e6c495a96ce0 ("mm: fix the TLB range flushed when __tlb_remove_page() runs out of slots"), but that fix was not complete. The problem with the TLB gather virtual address range is that it isn't set up by the initial tlb_gather_mmu() initialization (which didn't get the TLB range information), but it is set up ad-hoc later by the functions that actually flush the TLB. And so any such case that forgot to update the TLB range entries would potentially miss TLB invalidates. Rather than try to figure out exactly which particular ad-hoc range setup was missing (I personally suspect it's the hugetlb case in zap_huge_pmd(), which didn't have the same logic as zap_pte_range() did), this patch just gets rid of the problem at the source: make the TLB range information available to tlb_gather_mmu(), and initialize it when initializing all the other tlb gather fields. This makes the patch larger, but conceptually much simpler. And the end result is much more understandable; even if you want to play games with partial ranges when invalidating the TLB contents in chunks, now the range information is always there, and anybody who doesn't want to bother with it won't introduce subtle bugs. Ben verified that this fixes his problem. 
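One detail of the conversion worth spelling out is the fullmm test that replaces the old boolean argument: a gather covers the whole address space exactly when start == 0 and end == ~0UL, which is what !(start | (end + 1)) computes. A stand-alone check of that expression, in plain user-space C matching the hunks below:

#include <stdio.h>

/* Same expression the converted tlb_gather_mmu() implementations use. */
static int is_fullmm(unsigned long start, unsigned long end)
{
	return !(start | (end + 1));
}

int main(void)
{
	printf("start=0x0,    end=~0UL   -> fullmm=%d\n", is_fullmm(0x0, ~0UL));
	printf("start=0x0,    end=0x7fff -> fullmm=%d\n", is_fullmm(0x0, 0x7fffUL));
	printf("start=0x1000, end=~0UL   -> fullmm=%d\n", is_fullmm(0x1000UL, ~0UL));
	return 0;
}

That is also why the exit_mmap() hunk further down passes tlb_gather_mmu(&tlb, mm, 0, -1): it is the caller that still wants the full-mm flush.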
Reported-bisected-and-tested-by: Ben Tebulin Build-testing-by: Stephen Rothwell Build-testing-by: Richard Weinberger Reviewed-by: Michal Hocko Acked-by: Peter Zijlstra Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 7 +++++-- arch/arm64/include/asm/tlb.h | 7 +++++-- arch/ia64/include/asm/tlb.h | 9 ++++++--- arch/s390/include/asm/tlb.h | 8 ++++++-- arch/sh/include/asm/tlb.h | 6 ++++-- arch/um/include/asm/tlb.h | 6 ++++-- fs/exec.c | 4 ++-- include/asm-generic/tlb.h | 2 +- mm/hugetlb.c | 2 +- mm/memory.c | 36 +++++++++++++++++++++--------------- mm/mmap.c | 4 ++-- 11 files changed, 57 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 46e7cfb3e721..0baf7f0d9394 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -43,6 +43,7 @@ struct mmu_gather { struct mm_struct *mm; unsigned int fullmm; struct vm_area_struct *vma; + unsigned long start, end; unsigned long range_start; unsigned long range_end; unsigned int nr; @@ -107,10 +108,12 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int fullmm) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = fullmm; + tlb->fullmm = !(start | (end+1)); + tlb->start = start; + tlb->end = end; tlb->vma = NULL; tlb->max = ARRAY_SIZE(tlb->local); tlb->pages = tlb->local; diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 46b3beb4b773..717031a762c2 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -35,6 +35,7 @@ struct mmu_gather { struct mm_struct *mm; unsigned int fullmm; struct vm_area_struct *vma; + unsigned long start, end; unsigned long range_start; unsigned long range_end; unsigned int nr; @@ -97,10 +98,12 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int fullmm) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = fullmm; + tlb->fullmm = !(start | (end+1)); + tlb->start = start; + tlb->end = end; tlb->vma = NULL; tlb->max = ARRAY_SIZE(tlb->local); tlb->pages = tlb->local; diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index ef3a9de01954..bc5efc7c3f3f 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -22,7 +22,7 @@ * unmapping a portion of the virtual address space, these hooks are called according to * the following template: * - * tlb <- tlb_gather_mmu(mm, full_mm_flush); // start unmap for address space MM + * tlb <- tlb_gather_mmu(mm, start, end); // start unmap for address space MM * { * for each vma that needs a shootdown do { * tlb_start_vma(tlb, vma); @@ -58,6 +58,7 @@ struct mmu_gather { unsigned int max; unsigned char fullmm; /* non-zero means full mm flush */ unsigned char need_flush; /* really unmapped some PTEs? 
*/ + unsigned long start, end; unsigned long start_addr; unsigned long end_addr; struct page **pages; @@ -155,13 +156,15 @@ static inline void __tlb_alloc_page(struct mmu_gather *tlb) static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; tlb->max = ARRAY_SIZE(tlb->local); tlb->pages = tlb->local; tlb->nr = 0; - tlb->fullmm = full_mm_flush; + tlb->fullmm = !(start | (end+1)); + tlb->start = start; + tlb->end = end; tlb->start_addr = ~0UL; } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b75d7d686684..23a64d25f2b1 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -32,6 +32,7 @@ struct mmu_gather { struct mm_struct *mm; struct mmu_table_batch *batch; unsigned int fullmm; + unsigned long start, unsigned long end; }; struct mmu_table_batch { @@ -48,10 +49,13 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table); static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned int full_mm_flush) + unsigned long start, + unsigned long end) { tlb->mm = mm; - tlb->fullmm = full_mm_flush; + tlb->start = start; + tlb->end = end; + tlb->fullmm = !(start | (end+1)); tlb->batch = NULL; if (tlb->fullmm) __tlb_flush_mm(mm); diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index e61d43d9f689..362192ed12fe 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -36,10 +36,12 @@ static inline void init_tlb_gather(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = full_mm_flush; + tlb->start = start; + tlb->end = end; + tlb->fullmm = !(start | (end+1)); init_tlb_gather(tlb); } diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 4febacd1a8a1..29b0301c18aa 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -45,10 +45,12 @@ static inline void init_tlb_gather(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = full_mm_flush; + tlb->start = start; + tlb->end = end; + tlb->fullmm = !(start | (end+1)); init_tlb_gather(tlb); } diff --git a/fs/exec.c b/fs/exec.c index 9c73def87642..fd774c7cb483 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -608,7 +608,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) return -ENOMEM; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, old_start, old_end); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. @@ -625,7 +625,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } - tlb_finish_mmu(&tlb, new_end, old_end); + tlb_finish_mmu(&tlb, old_start, old_end); /* * Shrink the vma to just the new range. Always succeeds. 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 13821c339a41..5672d7ea1fa0 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -112,7 +112,7 @@ struct mmu_gather { #define HAVE_GENERIC_MMU_GATHER -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm); +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end); void tlb_flush_mmu(struct mmu_gather *tlb); void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 83aff0a4d093..b60f33080a28 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2490,7 +2490,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, mm = vma->vm_mm; - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); __unmap_hugepage_range(&tlb, vma, start, end, ref_page); tlb_finish_mmu(&tlb, start, end); } diff --git a/mm/memory.c b/mm/memory.c index 40268410732a..af84bc0ec17c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -209,14 +209,15 @@ static int tlb_next_batch(struct mmu_gather *tlb) * tear-down from @mm. The @fullmm argument is used when @mm is without * users and we're going to destroy the full address space (exit/execve). */ -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = fullmm; + /* Is it from 0 to ~0? */ + tlb->fullmm = !(start | (end+1)); tlb->need_flush_all = 0; - tlb->start = -1UL; - tlb->end = 0; + tlb->start = start; + tlb->end = end; tlb->need_flush = 0; tlb->local.next = NULL; tlb->local.nr = 0; @@ -256,8 +257,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e { struct mmu_gather_batch *batch, *next; - tlb->start = start; - tlb->end = end; tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ @@ -1099,7 +1098,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - unsigned long range_start = addr; again: init_rss_vec(rss); @@ -1205,17 +1203,25 @@ again: * and page-free while holding it. */ if (force_flush) { + unsigned long old_end; + force_flush = 0; -#ifdef HAVE_GENERIC_MMU_GATHER - tlb->start = range_start; + /* + * Flush the TLB just for the previous segment, + * then update the range to be the remaining + * TLB range. 
+ */ + old_end = tlb->end; tlb->end = addr; -#endif + tlb_flush_mmu(tlb); - if (addr != end) { - range_start = addr; + + tlb->start = addr; + tlb->end = old_end; + + if (addr != end) goto again; - } } return addr; @@ -1400,7 +1406,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end = start + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) @@ -1426,7 +1432,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr unsigned long end = address + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, address, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, address, end); unmap_single_vma(&tlb, vma, address, end, details); diff --git a/mm/mmap.c b/mm/mmap.c index 1edbaa3136c3..f9c97d10b873 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2336,7 +2336,7 @@ static void unmap_region(struct mm_struct *mm, struct mmu_gather tlb; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, @@ -2709,7 +2709,7 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb_gather_mmu(&tlb, mm, 1); + tlb_gather_mmu(&tlb, mm, 0, -1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); -- cgit v1.2.3-70-g09d2
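As a closing illustration of the zap_pte_range() force_flush logic above: the gather's range is narrowed to the segment processed so far, flushed, and then restored to the remainder before looping. Below is a user-space sketch of that bookkeeping; flush() is just a printf and the names are illustrative, not the kernel structures.

#include <stdio.h>

struct gather {
	unsigned long start, end;
};

static void flush(struct gather *tlb)
{
	printf("flush [%#lx, %#lx)\n", tlb->start, tlb->end);
}

/* Process [start, end) in fixed-size batches, mimicking the force_flush case. */
static void process_range(unsigned long start, unsigned long end,
			  unsigned long batch)
{
	struct gather tlb = { .start = start, .end = end };
	unsigned long addr = start;

	while (addr < end) {
		unsigned long next = addr + batch < end ? addr + batch : end;
		unsigned long old_end = tlb.end;

		/* ... "unmap" pages in [addr, next) ... */
		addr = next;

		/* flush just the segment handled so far */
		tlb.end = addr;
		flush(&tlb);

		/* then restore the range to the remaining part */
		tlb.start = addr;
		tlb.end = old_end;
	}
}

int main(void)
{
	process_range(0x1000, 0x5000, 0x2000);
	return 0;
}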