From 9aabf810a67cd97e2d1a48f0bab338b7680f1929 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Feb 2015 14:09:32 -0800 Subject: mm/slub: optimize alloc/free fastpath by removing preemption on/off We had to insert a preempt enable/disable in the fastpath a while ago in order to guarantee that tid and kmem_cache_cpu are retrieved on the same cpu. It is the problem only for CONFIG_PREEMPT in which scheduler can move the process to other cpu during retrieving data. Now, I reach the solution to remove preempt enable/disable in the fastpath. If tid is matched with kmem_cache_cpu's tid after tid and kmem_cache_cpu are retrieved by separate this_cpu operation, it means that they are retrieved on the same cpu. If not matched, we just have to retry it. With this guarantee, preemption enable/disable isn't need at all even if CONFIG_PREEMPT, so this patch removes it. I saw roughly 5% win in a fast-path loop over kmem_cache_alloc/free in CONFIG_PREEMPT. (14.821 ns -> 14.049 ns) Below is the result of Christoph's slab_test reported by Jesper Dangaard Brouer. * Before Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 10000 times kmalloc(8) -> 49 cycles kfree -> 62 cycles 10000 times kmalloc(16) -> 48 cycles kfree -> 64 cycles 10000 times kmalloc(32) -> 53 cycles kfree -> 70 cycles 10000 times kmalloc(64) -> 64 cycles kfree -> 77 cycles 10000 times kmalloc(128) -> 74 cycles kfree -> 84 cycles 10000 times kmalloc(256) -> 84 cycles kfree -> 114 cycles 10000 times kmalloc(512) -> 83 cycles kfree -> 116 cycles 10000 times kmalloc(1024) -> 81 cycles kfree -> 120 cycles 10000 times kmalloc(2048) -> 104 cycles kfree -> 136 cycles 10000 times kmalloc(4096) -> 142 cycles kfree -> 165 cycles 10000 times kmalloc(8192) -> 238 cycles kfree -> 226 cycles 10000 times kmalloc(16384) -> 403 cycles kfree -> 264 cycles 2. Kmalloc: alloc/free test 10000 times kmalloc(8)/kfree -> 68 cycles 10000 times kmalloc(16)/kfree -> 68 cycles 10000 times kmalloc(32)/kfree -> 69 cycles 10000 times kmalloc(64)/kfree -> 68 cycles 10000 times kmalloc(128)/kfree -> 68 cycles 10000 times kmalloc(256)/kfree -> 68 cycles 10000 times kmalloc(512)/kfree -> 74 cycles 10000 times kmalloc(1024)/kfree -> 75 cycles 10000 times kmalloc(2048)/kfree -> 74 cycles 10000 times kmalloc(4096)/kfree -> 74 cycles 10000 times kmalloc(8192)/kfree -> 75 cycles 10000 times kmalloc(16384)/kfree -> 510 cycles * After Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 10000 times kmalloc(8) -> 46 cycles kfree -> 61 cycles 10000 times kmalloc(16) -> 46 cycles kfree -> 63 cycles 10000 times kmalloc(32) -> 49 cycles kfree -> 69 cycles 10000 times kmalloc(64) -> 57 cycles kfree -> 76 cycles 10000 times kmalloc(128) -> 66 cycles kfree -> 83 cycles 10000 times kmalloc(256) -> 84 cycles kfree -> 110 cycles 10000 times kmalloc(512) -> 77 cycles kfree -> 114 cycles 10000 times kmalloc(1024) -> 80 cycles kfree -> 116 cycles 10000 times kmalloc(2048) -> 102 cycles kfree -> 131 cycles 10000 times kmalloc(4096) -> 135 cycles kfree -> 163 cycles 10000 times kmalloc(8192) -> 238 cycles kfree -> 218 cycles 10000 times kmalloc(16384) -> 399 cycles kfree -> 262 cycles 2. Kmalloc: alloc/free test 10000 times kmalloc(8)/kfree -> 65 cycles 10000 times kmalloc(16)/kfree -> 66 cycles 10000 times kmalloc(32)/kfree -> 65 cycles 10000 times kmalloc(64)/kfree -> 66 cycles 10000 times kmalloc(128)/kfree -> 66 cycles 10000 times kmalloc(256)/kfree -> 71 cycles 10000 times kmalloc(512)/kfree -> 72 cycles 10000 times kmalloc(1024)/kfree -> 71 cycles 10000 times kmalloc(2048)/kfree -> 71 cycles 10000 times kmalloc(4096)/kfree -> 71 cycles 10000 times kmalloc(8192)/kfree -> 65 cycles 10000 times kmalloc(16384)/kfree -> 511 cycles Most of the results are better than before. Note that this change slightly worses performance in !CONFIG_PREEMPT, roughly 0.3%. Implementing each case separately would help performance, but, since it's so marginal, I didn't do that. This would help maintanance since we have same code for all cases. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Tested-by: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index fe376fe1f4fe..e7ed6f8304f4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2398,13 +2398,24 @@ redo: * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. * - * Preemption is disabled for the retrieval of the tid because that - * must occur from the current processor. We cannot allow rescheduling - * on a different processor between the determination of the pointer - * and the retrieval of the tid. + * We should guarantee that tid and kmem_cache are retrieved on + * the same cpu. It could be different if CONFIG_PREEMPT so we need + * to check if it is matched or not. */ - preempt_disable(); - c = this_cpu_ptr(s->cpu_slab); + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); + + /* + * Irqless object alloc/free algorithm used here depends on sequence + * of fetching cpu_slab's data. tid should be fetched before anything + * on c to guarantee that object and page associated with previous tid + * won't be used with current tid. If we fetch tid first, object and + * page could be one associated with next tid and our alloc/free + * request will be failed. In this case, we will retry. So, no problem. + */ + barrier(); /* * The transaction ids are globally unique per cpu and per operation on @@ -2412,8 +2423,6 @@ redo: * occurs on the right processor and that there was no operation on the * linked list in between. */ - tid = c->tid; - preempt_enable(); object = c->freelist; page = c->page; @@ -2659,11 +2668,13 @@ redo: * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succedd. */ - preempt_disable(); - c = this_cpu_ptr(s->cpu_slab); + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); - tid = c->tid; - preempt_enable(); + /* Same with comment on barrier() in slab_alloc_node() */ + barrier(); if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); -- cgit v1.2.3-70-g09d2 From 94e4d712eb28c9a87c40c898af540725f63e68eb Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 10 Feb 2015 14:09:37 -0800 Subject: mm/slub.c: fix typo in comment Signed-off-by: Kim Phillips Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e7ed6f8304f4..8b8508adf9c2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2521,7 +2521,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif /* - * Slow patch handling. This may still be called frequently since objects + * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. * * So we still attempt to reduce cache line usage. Just take the slab -- cgit v1.2.3-70-g09d2 From 7c4da061f2e953df479b126b9263f0e845bce0ec Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Tue, 10 Feb 2015 14:09:40 -0800 Subject: mm/slab_common.c: use kmem_cache_free() Here, free memory is allocated using kmem_cache_zalloc. So, use kmem_cache_free instead of kfree. This is done using Coccinelle and semantic patch used is as follows: @@ expression x,E,c; @@ x = \(kmem_cache_alloc\|kmem_cache_zalloc\|kmem_cache_alloc_node\)(c,...) ... when != x = E when != &x ?-kfree(x) +kmem_cache_free(c,x) Signed-off-by: Vaishali Thakkar Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index e03dd6f2a272..67f182c10f24 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -331,7 +331,7 @@ out: out_free_cache: memcg_free_cache_params(s); - kfree(s); + kmem_cache_free(kmem_cache, s); goto out; } -- cgit v1.2.3-70-g09d2 From 3c4868710951dd7a6b991d71ca5f46737c4acf28 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 10 Feb 2015 14:09:43 -0800 Subject: mm/vmstat.c: fix/cleanup ifdefs CONFIG_COMPACTION=y, CONFIG_DEBUG_FS=n: mm/vmstat.c:690: warning: 'frag_start' defined but not used mm/vmstat.c:702: warning: 'frag_next' defined but not used mm/vmstat.c:710: warning: 'frag_stop' defined but not used mm/vmstat.c:715: warning: 'walk_zones_in_node' defined but not used It's all a bit of a tangly mess and it's unclear why CONFIG_COMPACTION figures in there at all. Move frag_start/frag_next/frag_stop and migratetype_names[] into the existing CONFIG_PROC_FS block. walk_zones_in_node() gets a special ifdef. Also move the #include lines up to where #include lines live. [axel.lin@ingics.com: fix build error when !CONFIG_PROC_FS] Signed-off-by: Axel Lin Cc: Mel Gorman Cc: Joonsoo Kim Cc: Mel Gorman Cc: Joonsoo Kim Tested-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 124 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 62 insertions(+), 62 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 1284f89fca08..9943e5fd74e6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order) } #endif -#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) -#include -#include - -static char * const migratetype_names[MIGRATE_TYPES] = { - "Unmovable", - "Reclaimable", - "Movable", - "Reserve", -#ifdef CONFIG_CMA - "CMA", -#endif -#ifdef CONFIG_MEMORY_ISOLATION - "Isolate", -#endif -}; - -static void *frag_start(struct seq_file *m, loff_t *pos) -{ - pg_data_t *pgdat; - loff_t node = *pos; - for (pgdat = first_online_pgdat(); - pgdat && node; - pgdat = next_online_pgdat(pgdat)) - --node; - - return pgdat; -} - -static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - - (*pos)++; - return next_online_pgdat(pgdat); -} - -static void frag_stop(struct seq_file *m, void *arg) -{ -} - -/* Walk all the zones in a node and print using a callback */ -static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, - void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) -{ - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - - spin_lock_irqsave(&zone->lock, flags); - print(m, pgdat, zone); - spin_unlock_irqrestore(&zone->lock, flags); - } -} -#endif - #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) #ifdef CONFIG_ZONE_DMA #define TEXT_FOR_DMA(xx) xx "_dma", @@ -907,7 +850,66 @@ const char * const vmstat_text[] = { #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ +#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ + defined(CONFIG_PROC_FS) +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return next_online_pgdat(pgdat); +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* Walk all the zones in a node and print using a callback */ +static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) +{ + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + print(m, pgdat, zone); + spin_unlock_irqrestore(&zone->lock, flags); + } +} +#endif + #ifdef CONFIG_PROC_FS +static char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Reclaimable", + "Movable", + "Reserve", +#ifdef CONFIG_CMA + "CMA", +#endif +#ifdef CONFIG_MEMORY_ISOLATION + "Isolate", +#endif +}; + static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void) module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) -#include - /* * Return an index indicating how much of the available free memory is -- cgit v1.2.3-70-g09d2 From c8d78c1823f46519473949d33f0d1d33fe21ea16 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:46 -0800 Subject: mm: replace remap_file_pages() syscall with emulation remap_file_pages(2) was invented to be able efficiently map parts of huge file into limited 32-bit virtual address space such as in database workloads. Nonlinear mappings are pain to support and it seems there's no legitimate use-cases nowadays since 64-bit systems are widely available. Let's drop it and get rid of all these special-cased code. The patch replaces the syscall with emulation which creates new VMA on each remap_file_pages(), unless they it can be merged with an adjacent one. I didn't find *any* real code that uses remap_file_pages(2) to test emulation impact on. I've checked Debian code search and source of all packages in ALT Linux. No real users: libc wrappers, mentions in strace, gdb, valgrind and this kind of stuff. There are few basic tests in LTP for the syscall. They work just fine with emulation. To test performance impact, I've written small test case which demonstrate pretty much worst case scenario: map 4G shmfs file, write to begin of every page pgoff of the page, remap pages in reverse order, read every page. The test creates 1 million of VMAs if emulation is in use, so I had to set vm.max_map_count to 1100000 to avoid -ENOMEM. Before: 23.3 ( +- 4.31% ) seconds After: 43.9 ( +- 0.85% ) seconds Slowdown: 1.88x I believe we can live with that. Test case: #define _GNU_SOURCE #include #include #include #include #define MB (1024UL * 1024) #define SIZE (4096 * MB) int main(int argc, char **argv) { unsigned long *p; long i, pass; for (pass = 0; pass < 10; pass++) { p = mmap(NULL, SIZE, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { perror("mmap"); return -1; } for (i = 0; i < SIZE / 4096; i++) p[i * 4096 / sizeof(*p)] = i; for (i = 0; i < SIZE / 4096; i++) { if (remap_file_pages(p + i * 4096 / sizeof(*p), 4096, 0, (SIZE - 4096 * (i + 1)) >> 12, 0)) { perror("remap_file_pages"); return -1; } } for (i = SIZE / 4096 - 1; i >= 0; i--) assert(p[i * 4096 / sizeof(*p)] == SIZE / 4096 - i - 1); munmap(p, SIZE); } return 0; } [akpm@linux-foundation.org: fix spello] [sasha.levin@oracle.com: initialize populate before usage] [sasha.levin@oracle.com: grab file ref to prevent race while mmaping] Signed-off-by: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Jones Cc: Linus Torvalds Cc: Armin Rigo Signed-off-by: Sasha Levin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/remap_file_pages.txt | 7 +- include/linux/fs.h | 8 +- mm/Makefile | 2 +- mm/fremap.c | 283 ---------------------------------- mm/mmap.c | 69 +++++++++ mm/nommu.c | 8 - 6 files changed, 79 insertions(+), 298 deletions(-) delete mode 100644 mm/fremap.c (limited to 'mm') diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt index 560e4363a55d..f609142f406a 100644 --- a/Documentation/vm/remap_file_pages.txt +++ b/Documentation/vm/remap_file_pages.txt @@ -18,10 +18,9 @@ on 32-bit systems to map files bigger than can linearly fit into 32-bit virtual address space. This use-case is not critical anymore since 64-bit systems are widely available. -The plan is to deprecate the syscall and replace it with an emulation. -The emulation will create new VMAs instead of nonlinear mappings. It's -going to work slower for rare users of remap_file_pages() but ABI is -preserved. +The syscall is deprecated and replaced it with an emulation now. The +emulation creates new VMAs instead of nonlinear mappings. It's going to +work slower for rare users of remap_file_pages() but ABI is preserved. One side effect of emulation (apart from performance) is that user can hit vm.max_map_count limit more easily due to additional VMAs. See comment for diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..60c4996df7f3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,8 +2481,12 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, - unsigned long size, pgoff_t pgoff); +static inline int generic_file_remap_pages(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/Makefile b/mm/Makefile index 4bf586e66378..3548460ab7b6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 2805d71cf476..000000000000 --- a/mm/fremap.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * linux/mm/fremap.c - * - * Explicit pagetable population and nonlinear (random) mappings support. - * - * started by Ingo Molnar, Copyright (C) 2002, 2003 - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "internal.h" - -static int mm_counter(struct page *page) -{ - return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; -} - -static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - struct page *page; - swp_entry_t entry; - - if (pte_present(pte)) { - flush_cache_page(vma, addr, pte_pfn(pte)); - pte = ptep_clear_flush_notify(vma, addr, ptep); - page = vm_normal_page(vma, addr, pte); - if (page) { - if (pte_dirty(pte)) - set_page_dirty(page); - update_hiwater_rss(mm); - dec_mm_counter(mm, mm_counter(page)); - page_remove_rmap(page); - page_cache_release(page); - } - } else { /* zap_pte() is not called when pte_none() */ - if (!pte_file(pte)) { - update_hiwater_rss(mm); - entry = pte_to_swp_entry(pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - dec_mm_counter(mm, mm_counter(page)); - } - } else { - free_swap_and_cache(entry); - dec_mm_counter(mm, MM_SWAPENTS); - } - } - pte_clear_not_present_full(mm, addr, ptep, 0); - } -} - -/* - * Install a file pte to a given virtual memory address, release any - * previously existing mapping. - */ -static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long pgoff, pgprot_t prot) -{ - int err = -ENOMEM; - pte_t *pte, ptfile; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - ptfile = pgoff_to_pte(pgoff); - - if (!pte_none(*pte)) - zap_pte(mm, vma, addr, pte); - - set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); - /* - * We don't need to run update_mmu_cache() here because the "file pte" - * being installed by install_file_pte() is not a real pte - it's a - * non-present entry (like a swap entry), noting what file offset should - * be mapped there when there's a fault (in a non-linear vma where - * that's not obvious). - */ - pte_unmap_unlock(pte, ptl); - err = 0; -out: - return err; -} - -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - struct mm_struct *mm = vma->vm_mm; - int err; - - do { - err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); - if (err) - return err; - - size -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } while (size); - - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - -/** - * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma - * @start: start of the remapped virtual memory range - * @size: size of the remapped virtual memory range - * @prot: new protection bits of the range (see NOTE) - * @pgoff: to-be-mapped page of the backing store file - * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. - * - * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma - * (shared backing store file). - * - * This syscall works purely via pagetables, so it's the most efficient - * way to map the same (large) file into a given virtual window. Unlike - * mmap()/mremap() it does not create any new vmas. The new mappings are - * also safe across swapout. - * - * NOTE: the @prot parameter right now is ignored (but must be zero), - * and the vma's default protection is used. Arbitrary protections - * might be implemented in the future. - */ -SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, - unsigned long, prot, unsigned long, pgoff, unsigned long, flags) -{ - struct mm_struct *mm = current->mm; - struct address_space *mapping; - struct vm_area_struct *vma; - int err = -EINVAL; - int has_write_lock = 0; - vm_flags_t vm_flags = 0; - - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); - - if (prot) - return err; - /* - * Sanitize the syscall parameters: - */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; - - /* Does the address range wrap, or is the span zero-sized? */ - if (start + size <= start) - return err; - - /* Does pgoff wrap? */ - if (pgoff + (size >> PAGE_SHIFT) < pgoff) - return err; - - /* Can we represent this offset inside this architecture's pte's? */ -#if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) - return err; -#endif - - /* We need down_write() to change vma->vm_flags. */ - down_read(&mm->mmap_sem); - retry: - vma = find_vma(mm, start); - - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within - * the single existing vma. - */ - if (!vma || !(vma->vm_flags & VM_SHARED)) - goto out; - - if (!vma->vm_ops || !vma->vm_ops->remap_pages) - goto out; - - if (start < vma->vm_start || start + size > vma->vm_end) - goto out; - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (!(vma->vm_flags & VM_NONLINEAR)) { - /* - * vm_private_data is used as a swapout cursor - * in a VM_NONLINEAR vma. - */ - if (vma->vm_private_data) - goto out; - - /* Don't need a nonlinear mapping, exit success */ - if (pgoff == linear_page_index(vma, start)) { - err = 0; - goto out; - } - - if (!has_write_lock) { -get_write_lock: - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - /* - * page_mkclean doesn't work on nonlinear vmas, so if - * dirty pages need to be accounted, emulate with linear - * vmas. - */ - if (mapping_cap_account_dirty(mapping)) { - unsigned long addr; - struct file *file = get_file(vma->vm_file); - /* mmap_region may free vma; grab the info now */ - vm_flags = vma->vm_flags; - - addr = mmap_region(file, start, size, vm_flags, pgoff); - fput(file); - if (IS_ERR_VALUE(addr)) { - err = addr; - } else { - BUG_ON(addr != start); - err = 0; - } - goto out_freed; - } - i_mmap_lock_write(mapping); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_interval_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (vma->vm_flags & VM_LOCKED) { - /* - * drop PG_Mlocked flag for over-mapped range - */ - if (!has_write_lock) - goto get_write_lock; - vm_flags = vma->vm_flags; - munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = vm_flags; - } - - mmu_notifier_invalidate_range_start(mm, start, start + size); - err = vma->vm_ops->remap_pages(vma, start, size, pgoff); - mmu_notifier_invalidate_range_end(mm, start, start + size); - - /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). - */ - -out: - if (vma) - vm_flags = vma->vm_flags; -out_freed: - if (likely(!has_write_lock)) - up_read(&mm->mmap_sem); - else - up_write(&mm->mmap_sem); - if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) - mm_populate(start, size); - - return err; -} diff --git a/mm/mmap.c b/mm/mmap.c index 7f684d5a8087..e023dc5e59a8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2634,6 +2634,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) return vm_munmap(addr, len); } + +/* + * Emulation of deprecated remap_file_pages() syscall. + */ +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) +{ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long populate = 0; + unsigned long ret = -EINVAL; + struct file *file; + + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + + if (prot) + return ret; + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + if (start + size <= start) + return ret; + + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return ret; + + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (start < vma->vm_start || start + size > vma->vm_end) + goto out; + + if (pgoff == linear_page_index(vma, start)) { + ret = 0; + goto out; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) { + flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ + munlock_vma_pages_range(vma, start, start + size); + } + + file = get_file(vma->vm_file); + ret = do_mmap_pgoff(vma->vm_file, start, size, + prot, flags, pgoff, &populate); + fput(file); +out: + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + if (!IS_ERR_VALUE(ret)) + ret = 0; + return ret; +} + static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM diff --git a/mm/nommu.c b/mm/nommu.c index 28bd8c4dff6f..541bed64e348 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1984,14 +1984,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_map_pages); -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { -- cgit v1.2.3-70-g09d2 From 8a5f14a23177061ec11daeaa3d09d0765d785c47 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:49 -0800 Subject: mm: drop support of non-linear mapping from unmap/zap codepath We have remap_file_pages(2) emulation in -mm tree for few release cycles and we plan to have it mainline in v3.20. This patchset removes rest of VM_NONLINEAR infrastructure. Patches 1-8 take care about generic code. They are pretty straight-forward and can be applied without other of patches. Rest patches removes pte_file()-related stuff from architecture-specific code. It usually frees up one bit in non-present pte. I've tried to reuse that bit for swap offset, where I was able to figure out how to do that. For obvious reason I cannot test all that arch-specific code and would like to see acks from maintainers. In total, remap_file_pages(2) required about 1.4K lines of not-so-trivial kernel code. That's too much for functionality nobody uses. Tested-by: Felipe Balbi This patch (of 38): We don't create non-linear mappings anymore. Let's drop code which handles them on unmap/zap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/madvise.c | 9 +----- mm/memory.c | 82 ++++++++++++++---------------------------------------- 3 files changed, 22 insertions(+), 70 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c6fd3c5424a..600ef5ed4698 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1146,7 +1146,6 @@ extern void user_shm_unlock(size_t, struct user_struct *); * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289..917754d26c17 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -278,14 +278,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - struct zap_details details = { - .nonlinear_vma = vma, - .last_index = ULONG_MAX, - }; - zap_page_range(vma, start, end - start, &details); - } else - zap_page_range(vma, start, end - start, NULL); + zap_page_range(vma, start, end - start, NULL); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 2c3536cc6c63..9a3e73b69dad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1082,6 +1082,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; + swp_entry_t entry; again: init_rss_vec(rss); @@ -1107,28 +1108,12 @@ again: if (details->check_mapping && details->check_mapping != page->mapping) continue; - /* - * Each page->index must be checked when - * invalidating or truncating nonlinear. - */ - if (details->nonlinear_vma && - (page->index < details->first_index || - page->index > details->last_index)) - continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (unlikely(details) && details->nonlinear_vma - && linear_page_index(details->nonlinear_vma, - addr) != page->index) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(ptent)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, addr, pte, ptfile); - } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -1151,33 +1136,25 @@ again: } continue; } - /* - * If details->check_mapping, we leave swap entries; - * if details->nonlinear_vma, we leave file entries. - */ + /* If details->check_mapping, we leave swap entries. */ if (unlikely(details)) continue; - if (pte_file(ptent)) { - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) - print_bad_pte(vma, addr, ptent, NULL); - } else { - swp_entry_t entry = pte_to_swp_entry(ptent); - if (!non_swap_entry(entry)) - rss[MM_SWAPENTS]--; - else if (is_migration_entry(entry)) { - struct page *page; + entry = pte_to_swp_entry(ptent); + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + else if (is_migration_entry(entry)) { + struct page *page; - page = migration_entry_to_page(entry); + page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; - } - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); + if (PageAnon(page)) + rss[MM_ANONPAGES]--; + else + rss[MM_FILEPAGES]--; } + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1277,7 +1254,7 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping && !details->nonlinear_vma) + if (details && !details->check_mapping) details = NULL; BUG_ON(addr >= end); @@ -1371,7 +1348,7 @@ void unmap_vmas(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * Caller must protect the VMA list */ @@ -1397,7 +1374,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * The range must fit into one VMA. */ @@ -2331,25 +2308,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, } } -static inline void unmap_mapping_range_list(struct list_head *head, - struct zap_details *details) -{ - struct vm_area_struct *vma; - - /* - * In nonlinear VMAs there is no correspondence between virtual address - * offset and file offset. So we must perform an exhaustive search - * across *all* the pages in each nonlinear VMA, not just the pages - * whose virtual address lies outside the file truncation point. - */ - list_for_each_entry(vma, head, shared.nonlinear) { - details->nonlinear_vma = vma; - unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); - } -} - /** - * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. + * unmap_mapping_range - unmap the portion of all mmaps in the specified + * address_space corresponding to the specified page range in the underlying + * file. + * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE @@ -2378,7 +2341,6 @@ void unmap_mapping_range(struct address_space *mapping, } details.check_mapping = even_cows? NULL: mapping; - details.nonlinear_vma = NULL; details.first_index = hba; details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) @@ -2388,8 +2350,6 @@ void unmap_mapping_range(struct address_space *mapping, i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); - if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) - unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); i_mmap_unlock_write(mapping); } EXPORT_SYMBOL(unmap_mapping_range); -- cgit v1.2.3-70-g09d2 From 9b4bdd2ffab9557ac43af7dff02e7dab1c8c58bd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:51 -0800 Subject: mm: drop support of non-linear mapping from fault codepath We don't create non-linear mappings anymore. Let's drop code which handles them on page fault. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 16 ++++++-------- mm/memory.c | 65 ++++++++---------------------------------------------- 2 files changed, 16 insertions(+), 65 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 600ef5ed4698..376e5c325dee 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -206,21 +206,19 @@ extern unsigned int kobjsize(const void *objp); extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ -#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ -#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ -#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ -#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ -#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ -#define FAULT_FLAG_TRIED 0x40 /* second try */ -#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ +#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */ +#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */ +#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */ +#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ +#define FAULT_FLAG_TRIED 0x20 /* Second try */ +#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * - * pgoff should be used in favour of virtual_address, if possible. If pgoff - * is used, one may implement ->remap_pages to get nonlinear mapping support. + * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { unsigned int flags; /* FAULT_FLAG_xxx flags */ diff --git a/mm/memory.c b/mm/memory.c index 9a3e73b69dad..43a53743cbb4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1899,12 +1899,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, EXPORT_SYMBOL_GPL(apply_to_page_range); /* - * handle_pte_fault chooses page fault handler according to an entry - * which was read non-atomically. Before making any commitment, on - * those architectures or configurations (e.g. i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault - * must check under lock before unmapping the pte and proceeding - * (but do_wp_page is only called after already making such a check; + * handle_pte_fault chooses page fault handler according to an entry which was + * read non-atomically. Before making any commitment, on those architectures + * or configurations (e.g. i386 with PAE) which might give a mix of unmatched + * parts, do_swap_page must check under lock before unmapping the pte and + * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, @@ -2710,8 +2709,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); @@ -2846,8 +2843,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && - fault_around_bytes >> PAGE_SHIFT > 1) { + if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) @@ -2992,7 +2988,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { @@ -3009,46 +3005,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -/* - * Fault of a previously existing named mapping. Repopulate the pte - * from the encoded file_pte if possible. This enables swappable - * nonlinear vmas. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with pte unmapped and unlocked. - * The mmap_sem may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). - */ -static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte) -{ - pgoff_t pgoff; - - flags |= FAULT_FLAG_NONLINEAR; - - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return 0; - - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { - /* - * Page table corrupted: show pte and kill process. - */ - print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_SIGBUS; - } - - pgoff = pte_to_pgoff(orig_pte); - if (!(flags & FAULT_FLAG_WRITE)) - return do_read_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); -} - static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) @@ -3176,15 +3132,12 @@ static int handle_pte_fault(struct mm_struct *mm, if (pte_none(entry)) { if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) - return do_linear_fault(mm, vma, address, - pte, pmd, flags, entry); + return do_fault(mm, vma, address, pte, + pmd, flags, entry); } return do_anonymous_page(mm, vma, address, pte, pmd, flags); } - if (pte_file(entry)) - return do_nonlinear_fault(mm, vma, address, - pte, pmd, flags, entry); return do_swap_page(mm, vma, address, pte, pmd, flags, entry); } -- cgit v1.2.3-70-g09d2 From d83a08db5ba6072caa658745881f4baa9bad6a08 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:54 -0800 Subject: mm: drop vm_ops->remap_pages and generic_file_remap_pages() stub Nobody uses it anymore. [akpm@linux-foundation.org: fix filemap_xip.c] Signed-off-by: Kirill A. Shutemov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 2 -- fs/btrfs/file.c | 1 - fs/ceph/addr.c | 1 - fs/cifs/file.c | 1 - fs/ext4/file.c | 1 - fs/f2fs/file.c | 1 - fs/fuse/file.c | 1 - fs/gfs2/file.c | 1 - fs/nfs/file.c | 1 - fs/nilfs2/file.c | 1 - fs/ocfs2/mmap.c | 1 - fs/ubifs/file.c | 1 - fs/xfs/xfs_file.c | 1 - include/linux/fs.h | 6 ------ include/linux/mm.h | 3 --- mm/filemap.c | 1 - mm/filemap_xip.c | 1 - mm/shmem.c | 1 - 18 files changed, 26 deletions(-) (limited to 'mm') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 5594505e6e73..b40133796b87 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { @@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4090259569b..a606ab551296 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c81c0e004588..24be059fd1f8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1569,7 +1569,6 @@ out: static struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 74f12877493a..294ff302a237 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3248,7 +3248,6 @@ static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8c0af3..7cb592386121 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -195,7 +195,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3c27e0ecb3bc..5674ba13102b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -92,7 +92,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int get_parent_ino(struct inode *inode, nid_t *pino) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 760b2c552197..d769e594855b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6e600abf694a..ec9c2d33477a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /** diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2ab6f00dba5b..94712fc781fa 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 3a03e0aea1fb..a8c728acb7a8 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 10d66c75cecb..9581d190f6e1 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,7 +173,6 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 538519ee37d9..035e51011444 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 13e974e6a889..ac7f1e8f92b3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1384,5 +1384,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 60c4996df7f3..47f557c7ef7e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,12 +2481,6 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -static inline int generic_file_remap_pages(struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 376e5c325dee..2ddd9d1d6268 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,9 +285,6 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr); #endif - /* called by sys_remap_file_pages() to populate non-linear mapping */ - int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff); }; struct mmu_gather; diff --git a/mm/filemap.c b/mm/filemap.c index 673e4581a2e5..bf7a27142704 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2087,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0d105aeff82f..70c09da1a419 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -301,7 +301,6 @@ out: static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, .page_mkwrite = filemap_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) diff --git a/mm/shmem.c b/mm/shmem.c index 993e6ba689cc..b3e403181981 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3201,7 +3201,6 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif - .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, -- cgit v1.2.3-70-g09d2 From 27ba0644ea9dfe6e7693abc85837b60e40583b96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:59 -0800 Subject: rmap: drop support of non-linear mappings We don't create non-linear mappings anymore. Let's drop code which handles them in rmap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cachetlb.txt | 8 +- fs/inode.c | 1 - include/linux/fs.h | 4 +- include/linux/mm.h | 6 -- include/linux/mm_types.h | 4 +- include/linux/rmap.h | 2 - kernel/fork.c | 8 +- mm/migrate.c | 32 ------- mm/mmap.c | 24 ++--- mm/rmap.c | 225 +-------------------------------------------- mm/swap.c | 4 +- 11 files changed, 18 insertions(+), 300 deletions(-) (limited to 'mm') diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index d79b008e4a32..3f9f808b5119 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -317,10 +317,10 @@ maps this page at its virtual address. about doing this. The idea is, first at flush_dcache_page() time, if - page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear - an empty list, just mark the architecture private page flag bit. - Later, in update_mmu_cache(), a check is made of this flag bit, - and if set the flush is done and the flag bit is cleared. + page->mapping->i_mmap is an empty tree, just mark the architecture + private page flag bit. Later, in update_mmu_cache(), a check is + made of this flag bit, and if set the flush is done and the flag + bit is cleared. IMPORTANT NOTE: It is often important, if you defer the flush, that the actual flush occurs on the same CPU diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..c760fac33c92 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -355,7 +355,6 @@ void address_space_init_once(struct address_space *mapping) INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT; - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 47f557c7ef7e..60acab209701 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,6 @@ struct address_space { spinlock_t tree_lock; /* and lock protecting it */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ - struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ @@ -493,8 +492,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping) */ static inline int mapping_mapped(struct address_space *mapping) { - return !RB_EMPTY_ROOT(&mapping->i_mmap) || - !list_empty(&mapping->i_mmap_nonlinear); + return !RB_EMPTY_ROOT(&mapping->i_mmap); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ddd9d1d6268..18391eec4864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1796,12 +1796,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) -static inline void vma_nonlinear_insert(struct vm_area_struct *vma, - struct list_head *list) -{ - list_add_tail(&vma->shared.nonlinear, list); -} - void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6d34aa266a8c..3b1d20fb0848 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -273,15 +273,13 @@ struct vm_area_struct { /* * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree, or - * linkage of vma in the address_space->i_mmap_nonlinear list. + * linkage into the address_space->i_mmap interval tree. */ union { struct { struct rb_node rb; unsigned long rb_subtree_last; } linear; - struct list_head nonlinear; } shared; /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d9d7e7e56352..b38f559130d5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -246,7 +246,6 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); * arg: passed to rmap_one() and invalid_vma() * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition - * file_nonlinear: for handling file nonlinear mapping * anon_lock: for getting anon_lock by optimized way rather than default * invalid_vma: for skipping uninterested vma */ @@ -255,7 +254,6 @@ struct rmap_walk_control { int (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); - int (*file_nonlinear)(struct page *, struct address_space *, void *arg); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; diff --git a/kernel/fork.c b/kernel/fork.c index 4dc2ddade9f1..b379d9abddc7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - if (unlikely(tmp->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(tmp, - &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc8..6e284bcca8bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -178,37 +178,6 @@ out: return SWAP_AGAIN; } -/* - * Congratulations to trinity for discovering this bug. - * mm/fremap.c's remap_file_pages() accepts any range within a single vma to - * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then - * replace the specified range by file ptes throughout (maybe populated after). - * If page migration finds a page within that range, while it's still located - * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: - * zap_pte() clears the temporary migration entry before mmap_sem is dropped. - * But if the migrating page is in a part of the vma outside the range to be - * remapped, then it will not be cleared, and remove_migration_ptes() needs to - * deal with it. Fortunately, this part of the vma is of course still linear, - * so we just need to use linear location on the nonlinear list. - */ -static int remove_linear_migration_ptes_from_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long addr; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr >= vma->vm_start && addr < vma->vm_end) - remove_migration_pte(page, vma, addr, arg); - } - return SWAP_AGAIN; -} - /* * Get rid of all migration entries and replace them by * references to the indicated page. @@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, - .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); diff --git a/mm/mmap.c b/mm/mmap.c index e023dc5e59a8..14d84666e8ba 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.nonlinear); - else - vma_interval_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } @@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) { - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); - if (adjust_next) - uprobe_munmap(next, next->vm_start, - next->vm_end); - } + if (adjust_next) + uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); if (insert) { @@ -3177,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * * mmap_sem in write mode is required in order to block all operations * that could modify pagetables and free pages without need of - * altering the vma layout (for example populate_range() with - * nonlinear vmas). It's also needed in write mode to avoid new + * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row diff --git a/mm/rmap.c b/mm/rmap.c index 71cd5bd0c17d..70b32498d4f2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { - if (!vma->vm_file || - vma->vm_file->f_mapping != page->mapping) + } else if (page->mapping) { + if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) return -EFAULT; } else return -EFAULT; @@ -1274,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ @@ -1316,211 +1314,6 @@ out_mlock: return ret; } -/* - * objrmap doesn't work for nonlinear VMAs because the assumption that - * offset-into-file correlates with offset-into-virtual-addresses does not hold. - * Consequently, given a particular page and its ->index, we cannot locate the - * ptes which are mapping that page without an exhaustive linear search. - * - * So what this code does is a mini "virtual scan" of each nonlinear VMA which - * maps the file to which the target page belongs. The ->vm_private_data field - * holds the current cursor into that scan. Successive searches will circulate - * around the vma's virtual address space. - * - * So as more replacement pressure is applied to the pages in a nonlinear VMA, - * more scanning pressure is placed against them as well. Eventually pages - * will become fully unmapped and are eligible for eviction. - * - * For very sparsely populated VMAs this is a little inefficient - chances are - * there there won't be many ptes located within the scan cluster. In this case - * maybe we could scan further - to the end of the pte page, perhaps. - * - * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can - * acquire it without blocking. If vma locked, mlock the pages in the cluster, - * rather than unmapping them. If we encounter the "check_page" that vmscan is - * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. - */ -#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) -#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) - -static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, - struct vm_area_struct *vma, struct page *check_page) -{ - struct mm_struct *mm = vma->vm_mm; - pmd_t *pmd; - pte_t *pte; - pte_t pteval; - spinlock_t *ptl; - struct page *page; - unsigned long address; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - unsigned long end; - int ret = SWAP_AGAIN; - int locked_vma = 0; - - address = (vma->vm_start + cursor) & CLUSTER_MASK; - end = address + CLUSTER_SIZE; - if (address < vma->vm_start) - address = vma->vm_start; - if (end > vma->vm_end) - end = vma->vm_end; - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return ret; - - mmun_start = address; - mmun_end = end; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - /* - * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, - * keep the sem while scanning the cluster for mlocking pages. - */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - locked_vma = (vma->vm_flags & VM_LOCKED); - if (!locked_vma) - up_read(&vma->vm_mm->mmap_sem); /* don't need it */ - } - - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - - for (; address < end; pte++, address += PAGE_SIZE) { - if (!pte_present(*pte)) - continue; - page = vm_normal_page(vma, address, *pte); - BUG_ON(!page || PageAnon(page)); - - if (locked_vma) { - if (page == check_page) { - /* we know we have check_page locked */ - mlock_vma_page(page); - ret = SWAP_MLOCK; - } else if (trylock_page(page)) { - /* - * If we can lock the page, perform mlock. - * Otherwise leave the page alone, it will be - * eventually encountered again later. - */ - mlock_vma_page(page); - unlock_page(page); - } - continue; /* don't unmap */ - } - - /* - * No need for _notify because we're within an - * mmu_notifier_invalidate_range_ {start|end} scope. - */ - if (ptep_clear_flush_young(vma, address, pte)) - continue; - - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); - - /* If nonlinear, store the file page offset in the pte. */ - if (page->index != linear_page_index(vma, address)) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(pteval)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, address, pte, ptfile); - } - - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) - set_page_dirty(page); - - page_remove_rmap(page); - page_cache_release(page); - dec_mm_counter(mm, MM_FILEPAGES); - (*mapcount)--; - } - pte_unmap_unlock(pte - 1, ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (locked_vma) - up_read(&vma->vm_mm->mmap_sem); - return ret; -} - -static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - int ret = SWAP_AGAIN; - unsigned long cursor; - unsigned long max_nl_cursor = 0; - unsigned long max_nl_size = 0; - unsigned int mapcount; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - if (cursor > max_nl_cursor) - max_nl_cursor = cursor; - cursor = vma->vm_end - vma->vm_start; - if (cursor > max_nl_size) - max_nl_size = cursor; - } - - if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - return SWAP_FAIL; - } - - /* - * We don't try to search for this page in the nonlinear vmas, - * and page_referenced wouldn't have found it anyway. Instead - * just walk the nonlinear vmas trying to age and unmap some. - * The mapcount of the page we came in with is irrelevant, - * but even so use it as a guide to how hard we should try? - */ - mapcount = page_mapcount(page); - if (!mapcount) - return ret; - - cond_resched(); - - max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; - if (max_nl_cursor == 0) - max_nl_cursor = CLUSTER_SIZE; - - do { - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - while (cursor < max_nl_cursor && - cursor < vma->vm_end - vma->vm_start) { - if (try_to_unmap_cluster(cursor, &mapcount, - vma, page) == SWAP_MLOCK) - ret = SWAP_MLOCK; - cursor += CLUSTER_SIZE; - vma->vm_private_data = (void *) cursor; - if ((int)mapcount <= 0) - return ret; - } - vma->vm_private_data = (void *) max_nl_cursor; - } - cond_resched(); - max_nl_cursor += CLUSTER_SIZE; - } while (max_nl_cursor <= max_nl_size); - - /* - * Don't loop forever (perhaps all the remaining pages are - * in locked vmas). Reset cursor on all unreserved nonlinear - * vmas, now forgetting on which ones it had fallen behind. - */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) - vma->vm_private_data = NULL; - - return ret; -} - bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1566,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = page_not_mapped, - .file_nonlinear = try_to_unmap_nonlinear, .anon_lock = page_lock_anon_vma_read, }; @@ -1612,12 +1404,6 @@ int try_to_munlock(struct page *page) .rmap_one = try_to_unmap_one, .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, - /* - * We don't bother to try to find the munlocked page in - * nonlinears. It's costly. Instead, later, page reclaim logic - * may call try_to_unmap() and recover PG_mlocked lazily. - */ - .file_nonlinear = NULL, .anon_lock = page_lock_anon_vma_read, }; @@ -1748,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) goto done; } - if (!rwc->file_nonlinear) - goto done; - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto done; - - ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: i_mmap_unlock_read(mapping); return ret; diff --git a/mm/swap.c b/mm/swap.c index 8a12b33936b4..5b3087228b99 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1140,10 +1140,8 @@ void __init swap_setup(void) if (bdi_init(swapper_spaces[0].backing_dev_info)) panic("Failed to init swap bdi"); - for (i = 0; i < MAX_SWAPFILES; i++) { + for (i = 0; i < MAX_SWAPFILES; i++) spin_lock_init(&swapper_spaces[i].tree_lock); - INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); - } #endif /* Use a smaller cluster for small-memory machines */ -- cgit v1.2.3-70-g09d2 From ac51b934f3912582d3c897c6c4d09b32ea57b2c7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:02 -0800 Subject: mm: replace vma->sharead.linear with vma->shared After removing vma->shared.nonlinear we have only one member of vma->shared union, which doesn't make much sense. This patch drops the union and move struct vma->shared.linear to vma->shared. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 8 +++----- mm/interval_tree.c | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b1d20fb0848..07c8bd3f7b48 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -275,11 +275,9 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. */ - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } linear; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; } shared; /* diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 8da581fa9060..f2c2492681bf 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; } -INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, - unsigned long, shared.linear.rb_subtree_last, +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, + unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ @@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); - if (!prev->shared.linear.rb.rb_right) { + if (!prev->shared.rb.rb_right) { parent = prev; - link = &prev->shared.linear.rb.rb_right; + link = &prev->shared.rb.rb_right; } else { - parent = rb_entry(prev->shared.linear.rb.rb_right, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; - while (parent->shared.linear.rb.rb_left) { - parent = rb_entry(parent->shared.linear.rb.rb_left, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; + parent = rb_entry(prev->shared.rb.rb_right, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; + while (parent->shared.rb.rb_left) { + parent = rb_entry(parent->shared.rb.rb_left, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; } - link = &parent->shared.linear.rb.rb_left; + link = &parent->shared.rb.rb_left; } - node->shared.linear.rb_subtree_last = last; - rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); - rb_insert_augmented(&node->shared.linear.rb, root, + node->shared.rb_subtree_last = last; + rb_link_node(&node->shared.rb, &parent->shared.rb, link); + rb_insert_augmented(&node->shared.rb, root, &vma_interval_tree_augment); } -- cgit v1.2.3-70-g09d2 From 0661a33611fca12570cba48d9344ce68834ee86c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:04 -0800 Subject: mm: remove rest usage of VM_NONLINEAR and pte_file() One bit in ->vm_flags is unused now! Signed-off-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_vma_manager.c | 3 +- include/linux/mm.h | 1 - include/linux/swapops.h | 4 +- mm/debug.c | 1 - mm/gup.c | 2 +- mm/ksm.c | 2 +- mm/madvise.c | 4 +- mm/memcontrol.c | 7 +--- mm/memory.c | 78 +++++++++++++++++++-------------------- mm/mincore.c | 9 +---- mm/mprotect.c | 2 +- mm/mremap.c | 2 - mm/msync.c | 5 +-- 13 files changed, 49 insertions(+), 71 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/drm_vma_manager.c b/drivers/gpu/drm/drm_vma_manager.c index 63b471205072..68c1f32fb086 100644 --- a/drivers/gpu/drm/drm_vma_manager.c +++ b/drivers/gpu/drm/drm_vma_manager.c @@ -50,8 +50,7 @@ * * You must not use multiple offset managers on a single address_space. * Otherwise, mm-core will be unable to tear down memory mappings as the VM will - * no longer be linear. Please use VM_NONLINEAR in that case and implement your - * own offset managers. + * no longer be linear. * * This offset manager works on page-based addresses. That is, every argument * and return code (with the exception of drm_vma_node_offset_addr()) is given diff --git a/include/linux/mm.h b/include/linux/mm.h index 18391eec4864..a0da685bdb82 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -138,7 +138,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_ARCH_2 0x02000000 #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6adfb7bfbf44..50cbc876be56 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); + return !pte_none(pte) && !pte_present_nonuma(pte); } #endif @@ -66,7 +66,6 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; - BUG_ON(pte_file(pte)); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); @@ -82,7 +81,6 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); - BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } diff --git a/mm/debug.c b/mm/debug.c index 0e58f3211f89..d69cb5a7ba9a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_ACCOUNT, "account" }, {VM_NORESERVE, "noreserve" }, {VM_HUGETLB, "hugetlb" }, - {VM_NONLINEAR, "nonlinear" }, #if defined(CONFIG_X86) {VM_PAT, "pat" }, #elif defined(CONFIG_PPC) diff --git a/mm/gup.c b/mm/gup.c index 8dd50ce6326f..12bc2bc33da7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -55,7 +55,7 @@ retry: */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; - if (pte_none(pte) || pte_file(pte)) + if (pte_none(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) diff --git a/mm/ksm.c b/mm/ksm.c index 15647fb0394f..4162dce2eb44 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/madvise.c b/mm/madvise.c index 917754d26c17..d79fb5e8f80a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, pte = *(orig_pte + ((index - start) / PAGE_SIZE)); pte_unmap_unlock(orig_pte, ptl); - if (pte_present(pte) || pte_none(pte) || pte_file(pte)) + if (pte_present(pte) || pte_none(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) @@ -296,7 +296,7 @@ static long madvise_remove(struct vm_area_struct *vma, *prev = NULL; /* tell sys_madvise we drop mmap_sem */ - if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) return -EINVAL; f = vma->vm_file; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2f6893c2f01b..8b58701b9647 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4926,10 +4926,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return NULL; mapping = vma->vm_file->f_mapping; - if (pte_none(ptent)) - pgoff = linear_page_index(vma, addr); - else /* pte_file(ptent) is true */ - pgoff = pte_to_pgoff(ptent); + pgoff = linear_page_index(vma, addr); /* page is moved even if it's not RSS of this task(page-faulted). */ #ifdef CONFIG_SWAP @@ -4961,7 +4958,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, addr, ptent, &ent); - else if (pte_none(ptent) || pte_file(ptent)) + else if (pte_none(ptent)) page = mc_handle_file_pte(vma, addr, ptent, &ent); if (!page && !ent.val) diff --git a/mm/memory.c b/mm/memory.c index 43a53743cbb4..9aa09217fe20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -811,42 +811,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { - if (!pte_file(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (likely(!non_swap_entry(entry))) { - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; - - if (is_write_migration_entry(entry) && - is_cow_mapping(vm_flags)) { - /* - * COW mappings require pages in both - * parent and child to be set to read. - */ - make_migration_entry_read(&entry); - pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) - pte = pte_swp_mksoft_dirty(pte); - set_pte_at(src_mm, addr, src_pte, pte); - } + swp_entry_t entry = pte_to_swp_entry(pte); + + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + rss[MM_SWAPENTS]++; + } else if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; + + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); + set_pte_at(src_mm, addr, src_pte, pte); } } goto out_set_pte; @@ -1020,11 +1018,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | - VM_PFNMAP | VM_MIXEDMAP))) { - if (!vma->anon_vma) - return 0; - } + if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !vma->anon_vma) + return 0; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); diff --git a/mm/mincore.c b/mm/mincore.c index c8c528b36641..46527c023e0c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { pte_t pte = *ptep; - pgoff_t pgoff; next = addr + PAGE_SIZE; if (pte_none(pte)) mincore_unmapped_range(vma, addr, next, vec); else if (pte_present(pte)) *vec = 1; - else if (pte_file(pte)) { - pgoff = pte_to_pgoff(pte); - *vec = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { /* pte is a swap entry */ + else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (non_swap_entry(entry)) { @@ -145,9 +141,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, *vec = 1; } else { #ifdef CONFIG_SWAP - pgoff = entry.val; *vec = mincore_page(swap_address_space(entry), - pgoff); + entry.val); #else WARN_ON(1); *vec = 1; diff --git a/mm/mprotect.c b/mm/mprotect.c index ace93454ce8e..33121662f08b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,7 +105,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } if (updated) pages++; - } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { + } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { diff --git a/mm/mremap.c b/mm/mremap.c index 17fa018f5f39..57dadc025c64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) pte = pte_mksoft_dirty(pte); else if (is_swap_pte(pte)) pte = pte_swp_mksoft_dirty(pte); - else if (pte_file(pte)) - pte = pte_file_mksoft_dirty(pte); #endif return pte; } diff --git a/mm/msync.c b/mm/msync.c index 992a1673d488..bb04d53ae852 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - if (vma->vm_flags & VM_NONLINEAR) - error = vfs_fsync(file, 1); - else - error = vfs_fsync_range(file, fstart, fend, 1); + error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; -- cgit v1.2.3-70-g09d2 From 74ec67511d36f9c731065b1dae7d9638a3b639d3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Feb 2015 14:11:28 -0800 Subject: mm: memory: remove ->vm_file check on shared writable vmas Shared anonymous mmaps are implemented with shmem files, so all VMAs with shared writable semantics also have an underlying backing file. Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 9aa09217fe20..0e9b32610655 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2127,9 +2127,7 @@ reuse: balance_dirty_pages_ratelimited(mapping); } - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); + file_update_time(vma->vm_file); } put_page(dirty_page); if (page_mkwrite) { @@ -2971,8 +2969,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, balance_dirty_pages_ratelimited(mapping); } - /* file_update_time outside page_lock */ - if (vma->vm_file && !vma->vm_ops->page_mkwrite) + if (!vma->vm_ops->page_mkwrite) file_update_time(vma->vm_file); return ret; -- cgit v1.2.3-70-g09d2 From f38b4b310d402055702c63b0989dbcd16adf9537 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Feb 2015 14:11:30 -0800 Subject: mm: memory: merge shared-writable dirtying branches in do_wp_page() Whether there is a vm_ops->page_mkwrite or not, the page dirtying is pretty much the same. Make sure the page references are the same in both cases, then merge the two branches. It's tempting to go even further and page-lock the !page_mkwrite case, to get it in line with everybody else setting the page table and thus further simplify the model. But that's not quite compelling enough to justify dropping the pte lock, then relocking and verifying the entry for filesystems without ->page_mkwrite, which notably includes tmpfs. Leave it for now and lock the page late in the !page_mkwrite case. Signed-off-by: Johannes Weiner Acked-by: Kirill A. Shutemov Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 48 +++++++++++++++++------------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0e9b32610655..988d3099a25d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2005,7 +2005,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int ret = 0; int page_mkwrite = 0; - struct page *dirty_page = NULL; + bool dirty_shared = false; unsigned long mmun_start = 0; /* For mmu_notifiers */ unsigned long mmun_end = 0; /* For mmu_notifiers */ struct mem_cgroup *memcg; @@ -2056,6 +2056,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { + page_cache_get(old_page); /* * Only catch write-faults on shared writable pages, * read-only shared pages can get COWed by @@ -2063,7 +2064,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); tmp = do_page_mkwrite(vma, old_page, address); if (unlikely(!tmp || (tmp & @@ -2083,11 +2084,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(old_page); goto unlock; } - page_mkwrite = 1; } - dirty_page = old_page; - get_page(dirty_page); + + dirty_shared = true; reuse: /* @@ -2106,43 +2106,29 @@ reuse: pte_unmap_unlock(page_table, ptl); ret |= VM_FAULT_WRITE; - if (!dirty_page) - return ret; - - if (!page_mkwrite) { + if (dirty_shared) { struct address_space *mapping; int dirtied; - lock_page(dirty_page); - dirtied = set_page_dirty(dirty_page); - VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page); - mapping = dirty_page->mapping; - unlock_page(dirty_page); + if (!page_mkwrite) + lock_page(old_page); - if (dirtied && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + dirtied = set_page_dirty(old_page); + VM_BUG_ON_PAGE(PageAnon(old_page), old_page); + mapping = old_page->mapping; + unlock_page(old_page); + page_cache_release(old_page); - file_update_time(vma->vm_file); - } - put_page(dirty_page); - if (page_mkwrite) { - struct address_space *mapping = dirty_page->mapping; - - set_page_dirty(dirty_page); - unlock_page(dirty_page); - page_cache_release(dirty_page); - if (mapping) { + if ((dirtied || page_mkwrite) && mapping) { /* * Some device drivers do not set page.mapping * but still dirty their pages */ balance_dirty_pages_ratelimited(mapping); } + + if (!page_mkwrite) + file_update_time(vma->vm_file); } return ret; -- cgit v1.2.3-70-g09d2 From 753162cd849c45580fb5aaa7f3597c81e74e391c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 10 Feb 2015 14:11:36 -0800 Subject: mm: hugetlb: fix type of hugetlb_treat_as_movable variable hugetlb_treat_as_movable declared as unsigned long, but proc_dointvec() used for parsing it: static struct ctl_table vm_table[] = { ... { .procname = "hugepages_treat_as_movable", .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, This seems harmless, but it's better to use int type here. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Manfred Spraul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 431b7fc605c9..7d7856359920 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -86,7 +86,7 @@ void free_huge_page(struct page *page); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); #endif -extern unsigned long hugepages_treat_as_movable; +extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85032de5e20f..be0e5d0db5ec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,7 @@ #include #include "internal.h" -unsigned long hugepages_treat_as_movable; +int hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; -- cgit v1.2.3-70-g09d2 From 4c5018ce06c6be292d4ed96cecf2c8dda361a923 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Tue, 10 Feb 2015 14:11:39 -0800 Subject: mm/page_alloc.c: place zone_id check before VM_BUG_ON_PAGE check If the freeing page and its buddy page are not at the same zone, the current holding zone->lock for the freeing page cann't prevent buddy page getting allocated, this could trigger VM_BUG_ON_PAGE in page_is_buddy() at a very tiny chance, such as: cpu 0: cpu 1: hold zone_1 lock check page and it buddy PageBuddy(buddy) is true hold zone_2 lock page_order(buddy) == order is true alloc buddy trigger VM_BUG_ON_PAGE(page_count(buddy) != 0) zone_1->lock prevents the freeing page getting allocated zone_2->lock prevents the buddy page getting allocated they are not the same zone->lock. If we can't remove the zone_id check statement, it's better handle this rare race. This patch fixes this by placing the zone_id check before the VM_BUG_ON_PAGE check. Signed-off-by: Weijie Yang Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e20f9c2fa5a..f121050e8530 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -552,17 +552,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; if (page_is_guard(buddy) && page_order(buddy) == order) { - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - if (page_zone_id(page) != page_zone_id(buddy)) return 0; + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - /* * zone check is done late to avoid uselessly * calculating zone/node ids for pages that could @@ -571,6 +569,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (page_zone_id(page) != page_zone_id(buddy)) return 0; + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + return 1; } return 0; -- cgit v1.2.3-70-g09d2 From dbf22eb6d8675fc173154d9f1bd1bd0fda53a001 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:41 -0800 Subject: memcg: zap __memcg_{charge,uncharge}_slab They are simple wrappers around memcg_{charge,uncharge}_kmem, so let's zap them and call these functions directly. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 21 +++------------------ mm/slab.h | 4 ++-- 3 files changed, 8 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7c95af8d552c..18ccb2988979 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -403,8 +403,9 @@ void memcg_update_array_size(int num_groups); struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages); +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); int __memcg_cleanup_cache_params(struct kmem_cache *s); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b58701b9647..e229e3ad615c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2495,8 +2495,8 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages) { struct page_counter *counter; int ret = 0; @@ -2533,8 +2533,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, - unsigned long nr_pages) +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) { page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) @@ -2767,20 +2766,6 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, current->memcg_kmem_skip_account = 0; } -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) -{ - unsigned int nr_pages = 1 << order; - - return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); -} - -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) -{ - unsigned int nr_pages = 1 << order; - - memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); -} - /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. diff --git a/mm/slab.h b/mm/slab.h index 1cf4005482dd..90430d6f665e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -235,7 +235,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return __memcg_charge_slab(s, gfp, order); + return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -244,7 +244,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - __memcg_uncharge_slab(s, order); + memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); } #else static inline bool is_root_cache(struct kmem_cache *s) -- cgit v1.2.3-70-g09d2 From 3e0350a36414a73c5c2d1e354f8c0ab4ace1296d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:44 -0800 Subject: memcg: zap memcg_name argument of memcg_create_kmem_cache Instead of passing the name of the memory cgroup which the cache is created for in the memcg_name_argument, let's obtain it immediately in memcg_create_kmem_cache. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 +-- mm/memcontrol.c | 5 +---- mm/slab_common.c | 9 +++++---- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9a139b637069..eca9ed303a1b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -117,8 +117,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *, - const char *); + struct kmem_cache *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e229e3ad615c..baf7eb27e3ae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2607,8 +2607,6 @@ void memcg_update_array_size(int num) static void memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - static char memcg_name_buf[NAME_MAX + 1]; /* protected by - memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -2624,8 +2622,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); - cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); + cachep = memcg_create_kmem_cache(memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/slab_common.c b/mm/slab_common.c index 67f182c10f24..1b782a2d3b3d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -430,16 +430,15 @@ EXPORT_SYMBOL(kmem_cache_create); * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. - * @memcg_name: The name of the memory cgroup (used for naming the new cache). * * This function attempts to create a kmem cache that will serve allocation * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache, - const char *memcg_name) + struct kmem_cache *root_cache) { + static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ struct kmem_cache *s = NULL; char *cache_name; @@ -448,8 +447,10 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + cgroup_name(mem_cgroup_css(memcg)->cgroup, + memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name); + memcg_cache_id(memcg), memcg_name_buf); if (!cache_name) goto out_unlock; -- cgit v1.2.3-70-g09d2 From d5b3cf7139b8770af4ed8bb36a1ab9d290ac39e9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:47 -0800 Subject: memcg: zap memcg_slab_caches and memcg_slab_mutex mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to the given cgroup. Currently, it is only used on css free in order to destroy all caches corresponding to the memory cgroup being freed. The list is protected by memcg_slab_mutex. The mutex is also used to protect kmem_cache->memcg_params->memcg_caches arrays and synchronizes kmem_cache_destroy vs memcg_unregister_all_caches. However, we can perfectly get on without these two. To destroy all caches corresponding to a memory cgroup, we can walk over the global list of kmem caches, slab_caches, and we can do all the synchronization stuff using the slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid of the memcg_slab_caches and memcg_slab_mutex. Apart from this nice cleanup, it also: - assures that rcu_barrier() is called once at max when a root cache is destroyed or a memory cgroup is freed, no matter how many caches have SLAB_DESTROY_BY_RCU flag set; - fixes the race between kmem_cache_destroy and kmem_cache_create that exists, because memcg_cleanup_cache_params, which is called from kmem_cache_destroy after checking that kmem_cache->refcount=0, releases the slab_mutex, which gives kmem_cache_create a chance to make an alias to a cache doomed to be destroyed. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 - include/linux/slab.h | 6 +- mm/memcontrol.c | 156 +++++---------------------------------------- mm/slab_common.c | 142 +++++++++++++++++++++++++++++------------ 4 files changed, 120 insertions(+), 186 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 18ccb2988979..fb212e1d700d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -407,8 +407,6 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); -int __memcg_cleanup_cache_params(struct kmem_cache *s); - /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. diff --git a/include/linux/slab.h b/include/linux/slab.h index eca9ed303a1b..2e3b448cfa2d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,8 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *); +void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); +void memcg_destroy_kmem_caches(struct mem_cgroup *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -490,7 +490,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * Child caches will hold extra metadata needed for its operation. Fields are: * * @memcg: pointer to the memcg this cache belongs to - * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from */ struct memcg_cache_params { @@ -502,7 +501,6 @@ struct memcg_cache_params { }; struct { struct mem_cgroup *memcg; - struct list_head list; struct kmem_cache *root_cache; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index baf7eb27e3ae..f3f8a4f52a0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -343,9 +343,6 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list, but per-memcg; - * protected by memcg_slab_mutex */ - struct list_head memcg_slab_caches; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -2476,25 +2473,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -/* - * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or - * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. - */ -static DEFINE_MUTEX(memcg_slab_mutex); - -/* - * This is a bit cumbersome, but it is rarely used and avoids a backpointer - * in the memcg_cache_params struct. - */ -static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) -{ - struct kmem_cache *cachep; - - VM_BUG_ON(p->is_root_cache); - cachep = p->root_cache; - return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); -} - int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages) { @@ -2578,10 +2556,7 @@ static int memcg_alloc_cache_id(void) else if (size > MEMCG_CACHES_MAX_SIZE) size = MEMCG_CACHES_MAX_SIZE; - mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(size); - mutex_unlock(&memcg_slab_mutex); - if (err) { ida_simple_remove(&kmem_limited_groups, id); return err; @@ -2604,120 +2579,20 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = num; } -static void memcg_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - struct kmem_cache *cachep; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - id = memcg_cache_id(memcg); - - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can try to - * create the same cache, but only one of them may succeed. - */ - if (cache_from_memcg_idx(root_cache, id)) - return; - - cachep = memcg_create_kmem_cache(memcg, root_cache); - /* - * If we could not create a memcg cache, do not complain, because - * that's not critical at all as we can always proceed with the root - * cache. - */ - if (!cachep) - return; - - list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); - - /* - * Since readers won't lock (see cache_from_memcg_idx()), we need a - * barrier here to ensure nobody will see the kmem_cache partially - * initialized. - */ - smp_wmb(); - - BUG_ON(root_cache->memcg_params->memcg_caches[id]); - root_cache->memcg_params->memcg_caches[id] = cachep; -} - -static void memcg_unregister_cache(struct kmem_cache *cachep) -{ - struct kmem_cache *root_cache; - struct mem_cgroup *memcg; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - BUG_ON(is_root_cache(cachep)); - - root_cache = cachep->memcg_params->root_cache; - memcg = cachep->memcg_params->memcg; - id = memcg_cache_id(memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); - root_cache->memcg_params->memcg_caches[id] = NULL; - - list_del(&cachep->memcg_params->list); - - kmem_cache_destroy(cachep); -} - -int __memcg_cleanup_cache_params(struct kmem_cache *s) -{ - struct kmem_cache *c; - int i, failed = 0; - - mutex_lock(&memcg_slab_mutex); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - - memcg_unregister_cache(c); - - if (cache_from_memcg_idx(s, i)) - failed++; - } - mutex_unlock(&memcg_slab_mutex); - return failed; -} - -static void memcg_unregister_all_caches(struct mem_cgroup *memcg) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *params, *tmp; - - if (!memcg_kmem_is_active(memcg)) - return; - - mutex_lock(&memcg_slab_mutex); - list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { - cachep = memcg_params_to_cache(params); - memcg_unregister_cache(cachep); - } - mutex_unlock(&memcg_slab_mutex); -} - -struct memcg_register_cache_work { +struct memcg_kmem_cache_create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_register_cache_func(struct work_struct *w) +static void memcg_kmem_cache_create_func(struct work_struct *w) { - struct memcg_register_cache_work *cw = - container_of(w, struct memcg_register_cache_work, work); + struct memcg_kmem_cache_create_work *cw = + container_of(w, struct memcg_kmem_cache_create_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - mutex_lock(&memcg_slab_mutex); - memcg_register_cache(memcg, cachep); - mutex_unlock(&memcg_slab_mutex); + memcg_create_kmem_cache(memcg, cachep); css_put(&memcg->css); kfree(cw); @@ -2726,10 +2601,10 @@ static void memcg_register_cache_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct memcg_register_cache_work *cw; + struct memcg_kmem_cache_create_work *cw; cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (!cw) @@ -2739,18 +2614,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; + INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_register_cache will recurse. + * in __memcg_schedule_kmem_cache_create will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -2759,7 +2634,7 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ current->memcg_kmem_skip_account = 1; - __memcg_schedule_register_cache(memcg, cachep); + __memcg_schedule_kmem_cache_create(memcg, cachep); current->memcg_kmem_skip_account = 0; } @@ -2807,7 +2682,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) * could happen with the slab_mutex held. So it's better to * defer everything. */ - memcg_schedule_register_cache(memcg, cachep); + memcg_schedule_kmem_cache_create(memcg, cachep); out: css_put(&memcg->css); return cachep; @@ -4136,7 +4011,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { - memcg_unregister_all_caches(memcg); + memcg_destroy_kmem_caches(memcg); mem_cgroup_sockets_destroy(memcg); } #else @@ -4664,7 +4539,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) spin_lock_init(&memcg->event_list_lock); #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->memcg_slab_caches); #endif return &memcg->css; diff --git a/mm/slab_common.c b/mm/slab_common.c index 1b782a2d3b3d..6e1e4cf65836 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -425,6 +425,49 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); +static int do_kmem_cache_shutdown(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + if (__kmem_cache_shutdown(s) != 0) { + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + return -EBUSY; + } + + if (s->flags & SLAB_DESTROY_BY_RCU) + *need_rcu_barrier = true; + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) { + struct kmem_cache *root_cache = s->memcg_params->root_cache; + int memcg_id = memcg_cache_id(s->memcg_params->memcg); + + BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); + root_cache->memcg_params->memcg_caches[memcg_id] = NULL; + } +#endif + list_move(&s->list, release); + return 0; +} + +static void do_kmem_cache_release(struct list_head *release, + bool need_rcu_barrier) +{ + struct kmem_cache *s, *s2; + + if (need_rcu_barrier) + rcu_barrier(); + + list_for_each_entry_safe(s, s2, release, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + slab_kmem_cache_release(s); +#endif + } +} + #ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. @@ -435,10 +478,11 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) +void memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ + int memcg_id = memcg_cache_id(memcg); struct kmem_cache *s = NULL; char *cache_name; @@ -447,6 +491,14 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, memcg_id)) + goto out_unlock; + cgroup_name(mem_cgroup_css(memcg)->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, @@ -458,49 +510,73 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); + /* + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. + */ if (IS_ERR(s)) { kfree(cache_name); - s = NULL; + goto out_unlock; } + /* + * Since readers won't lock (see cache_from_memcg_idx()), we need a + * barrier here to ensure nobody will see the kmem_cache partially + * initialized. + */ + smp_wmb(); + root_cache->memcg_params->memcg_caches[memcg_id] = s; + out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - return s; } -static int memcg_cleanup_cache_params(struct kmem_cache *s) +void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { - int rc; + LIST_HEAD(release); + bool need_rcu_barrier = false; + struct kmem_cache *s, *s2; - if (!s->memcg_params || - !s->memcg_params->is_root_cache) - return 0; + get_online_cpus(); + get_online_mems(); - mutex_unlock(&slab_mutex); - rc = __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex); + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (is_root_cache(s) || s->memcg_params->memcg != memcg) + continue; + /* + * The cgroup is about to be freed and therefore has no charges + * left. Hence, all its caches must be empty by now. + */ + BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + } + mutex_unlock(&slab_mutex); - return rc; -} -#else -static int memcg_cleanup_cache_params(struct kmem_cache *s) -{ - return 0; + put_online_mems(); + put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { + memcg_free_cache_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } void kmem_cache_destroy(struct kmem_cache *s) { + int i; + LIST_HEAD(release); + bool need_rcu_barrier = false; + bool busy = false; + get_online_cpus(); get_online_mems(); @@ -510,35 +586,23 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - if (memcg_cleanup_cache_params(s) != 0) - goto out_unlock; + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); - goto out_unlock; + if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + busy = true; } - list_del(&s->list); - - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); -#else - slab_kmem_cache_release(s); -#endif - goto out; + if (!busy) + do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); out_unlock: mutex_unlock(&slab_mutex); -out: + put_online_mems(); put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3-70-g09d2